Diffstat (limited to 'compute/ARMComputeEx')
-rw-r--r-- compute/ARMComputeEx/CMakeLists.txt | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h | 12
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h | 115
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h | 81
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h | 6
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h | 101
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h | 6
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h | 107
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h | 152
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h | 124
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h | 8
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h | 84
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h | 135
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h | 86
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h (renamed from compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h) | 75
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h (renamed from compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h) | 74
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h | 3
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h | 140
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h | 108
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/TypesEx.h | 9
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/UtilsEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h | 44
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h | 5
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h | 109
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h | 1
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h (renamed from compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPOneHotEx.h) | 43
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h | 95
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h | 8
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h | 31
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h | 7
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h | 3
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h | 122
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h | 130
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h | 9
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h | 89
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h | 92
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h | 103
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h | 114
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h | 77
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h | 14
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h | 1
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h | 3
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h | 93
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h | 12
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h | 61
-rw-r--r-- compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 193
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h | 96
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl | 564
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl | 4
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 97
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl | 12
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl | 7210
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h | 1235
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl | 2733
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl | 10
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl | 12
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 22
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h | 244
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl | 20
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl | 88
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl | 4
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl | 222
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl | 346
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl | 2
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl | 10
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl | 18
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h | 223
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl | 102
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl | 292
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp | 332
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 36
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp | 135
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp | 17
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp | 22
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp | 21
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp | 14
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp | 133
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp | 26
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp | 7
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp | 189
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp | 292
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp | 26
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp | 21
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp | 14
-rw-r--r-- compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp | 119
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp | 362
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp | 730
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp | 253
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp | 347
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp | 30
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp | 190
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp | 143
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp | 51
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp | 225
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp | 56
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp | 233
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp | 101
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp | 693
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp | 224
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp | 5
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp (renamed from compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp) | 14
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp | 60
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp | 62
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp | 92
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp | 171
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp | 75
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp | 110
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp | 21
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp | 197
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp | 16
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp | 18
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp | 86
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp | 56
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp | 55
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp | 87
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp | 10
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp (renamed from compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp) | 31
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp | 18
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp | 14
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp | 173
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp | 59
154 files changed, 18413 insertions, 4604 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt
index 58f558db2..c8d12c249 100644
--- a/compute/ARMComputeEx/CMakeLists.txt
+++ b/compute/ARMComputeEx/CMakeLists.txt
@@ -14,7 +14,7 @@ file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp")
# generate embeded cl_kernel
execute_process (
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
- COMMAND bash -c "python resolve_includes.py"
+ COMMAND bash -c "python3 resolve_includes.py"
)
add_library(arm_compute_ex SHARED ${ACL_EX_SRCS})
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
index d29886a9d..d3e116381 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -255,14 +255,14 @@ private:
cl::Device _device; /**< Underlying CL device. */
std::string _kernel_path; /**< Path to the kernels folder. */
mutable std::map<std::string, const Program>
- _programs_map; /**< Map with all already loaded program data. */
+ _programs_map; /**< Map with all already loaded program data. */
mutable std::map<std::string, cl::Program>
- _built_programs_map; /**< Map with all already built program data. */
+ _built_programs_map; /**< Map with all already built program data. */
static const std::map<std::string, std::string>
- _kernel_program_map; /**< Map that associates kernel names with programs. */
+ _kernel_program_map; /**< Map that associates kernel names with programs. */
static const std::map<std::string, std::string>
- _program_source_map; /**< Contains sources for all programs.
- Used for compile-time kernel inclusion. >*/
+ _program_source_map; /**< Contains sources for all programs.
+ Used for compile-time kernel inclusion. >*/
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
new file mode 100644
index 000000000..46d4ae858
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
+#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
+
+#include "src/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the reduction operation kernel
+ *
+ * @note The default data type for an uninitialized output tensor is
+ * signed 32-bit integer (S32). It is the user's responsibility to check
+ * that the results do not overflow because the indices are computed
+ * in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerKernelEx : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLArgMinMaxLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~CLArgMinMaxLayerKernelEx() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: S32/F16/F32.
+ * @param[in] prev_output Destination tensor of the previous iterations of @ref
+ * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32
+ * Has to be nullptr for the first iteration
+ * @param[out] output Destination tensor. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output,
+ unsigned int axis, ReductionOperation op);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArgMinMaxLayerKernelEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: S32/F16/F32.
+ * @param[in] prev_output Destination tensor info of the previous iterations. Data types
+ * supported: U32/S32
+ * Has to be nullptr for the first iteration
+ * @param[in] output Destination tensor info. Data types supported: U32/S32
+ * Output will have the same number of dimensions as input.
+ * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
+ * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output,
+ const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_prev_output;
+ ICLTensor *_output;
+ unsigned int _reduction_axis;
+ ReductionOperation _op;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */
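
For context, a minimal usage sketch of the new arg-min/max path (not part of this patch): it goes through the CLArgMinMaxLayerEx runtime function added alongside this kernel, and assumes that function mirrors the stock CLArgMinMaxLayer configure(input, axis, output, op) signature. Tensor shapes and the S32 output type are illustrative only.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"

using namespace arm_compute;

void argmax_example()
{
  CLScheduler::get().default_init(); // create the CL context and queue

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(2U, 3U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::S32)); // indices along axis 0

  CLArgMinMaxLayerEx argmax;
  argmax.configure(&input, /*axis=*/0, &output, ReductionOperation::ARG_IDX_MAX); // assumed signature

  input.allocator()->allocate();
  output.allocator()->allocate();
  argmax.run();
  CLScheduler::get().sync();
}
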
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
index bb6fcb8f5..eac866b67 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
@@ -41,8 +41,8 @@
#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
new file mode 100644
index 000000000..cf671102e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLCastBoolKernel.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file defines CLCastBoolKernel class
+ */
+
+#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
+#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
+
+#include "src/core/CL/ICLSimple3DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class for the kernel converting boolean type
+ */
+class CLCastBoolKernel : public ICLSimple3DKernel
+{
+public:
+ /**
+ * @brief Initialise the kernel's input and output.
+ * @param[in] input Input tensor. Data types supported: U8
+ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLCastBoolKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: U8.
+ * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ */
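
A short usage sketch for the boolean cast path (illustrative, not from this patch): the CLCastBool runtime function added in this change wraps this kernel; the configure(input, output) call below assumes the usual simple-function pattern, and the 1-D shapes are arbitrary.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCastBool.h"

using namespace arm_compute;

void cast_bool_example()
{
  CLScheduler::get().default_init();

  CLTensor flags, values;
  flags.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8));   // boolean input, 0 or 1 per element
  values.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // cast destination

  CLCastBool cast;
  cast.configure(&flags, &values); // assumed (input, output) signature

  flags.allocator()->allocate();
  values.allocator()->allocate();
  cast.run();
}
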
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
index a614d5259..6729fb0f1 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -47,15 +47,15 @@
#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
class ICLTensor;
/**
-* @brief Class to perform EmbeddingLookup operation with opencl kernel
-*/
+ * @brief Class to perform EmbeddingLookup operation with opencl kernel
+ */
class CLEmbeddingLookupKernel : public ICLKernel
{
public:
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 000000000..64908ab59
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H
+#define ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface to add a bias to each row of the input tensor
+ *
+ */
+class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel &
+ operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types
+ * supported: Same as @p input
+ */
+ void configure(ICLTensor *accum, const ICLTensor *biases);
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data
+ * types supported: Same as @p input
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *accum,
+ const ICLTensor *biases);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLGEMMMatrixAccumulateBiasesKernel
+ *
+ * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types
+ * supported: Same as @p input
+ * @param[in] gpu_target GPU target
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_accum;
+ const ICLTensor *_biases;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H */
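
Since this kernel has no dedicated runtime wrapper, a direct-enqueue sketch (illustrative only) follows. The configure signature is the one declared above; the shapes are arbitrary, and the include paths assume the ARMComputeEx arm_compute/ root plus the ACL source tree (for src/core/CL/ICLKernel.h) are on the include path.

#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void accumulate_biases_example()
{
  CLScheduler::get().default_init();

  CLTensor accum, biases;
  accum.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32)); // 4 rows of 8 accumulators
  biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));    // 1D bias, same type as accum

  CLGEMMMatrixAccumulateBiasesKernel bias_kernel;
  bias_kernel.configure(&accum, &biases); // adds the bias to every row of accum, in place

  accum.allocator()->allocate();
  biases.allocator()->allocate();
  CLScheduler::get().enqueue(bias_kernel); // run over the kernel's full window
}
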
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
index 6630c7be7..a55f2401d 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__
#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
index 99cfa61ec..f9d6f7cc5 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
namespace arm_compute
@@ -55,8 +55,8 @@ namespace arm_compute
class ICLTensor;
/**
-* @brief Class to perform HashtableLookup operation with opencl kernel
-*/
+ * @brief Class to perform HashtableLookup operation with opencl kernel
+ */
class CLHashtableLookupKernel : public ICLKernel
{
public:
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
index f57e799ad..7da9e9a4c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h
new file mode 100644
index 000000000..4befdd05c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H
+#define ARM_COMPUTE_CLMEMSETKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for filling the planes of a tensor */
+class CLMemsetKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMemsetKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMemsetKernel(const CLMemsetKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMemsetKernel &operator=(const CLMemsetKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMemsetKernel(CLMemsetKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMemsetKernel &operator=(CLMemsetKernel &&) = default;
+ /** Default destructor */
+ ~CLMemsetKernel() = default;
+
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default
+ * is nullptr.
+ */
+ void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default
+ * is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *tensor,
+ const PixelValue &constant_value, Window *window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLMemsetKernel
+ *
+ * @param[in] tensor Source tensor info. Data types supported: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default is
+ * nullptr.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value,
+ Window *window = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_tensor;
+ Window _full_window;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMEMSETRKERNEL_H */
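
A minimal sketch of using the kernel directly to zero-fill a tensor (illustrative, not part of the patch); shapes are arbitrary and, as above, the header pulls in the private src/core/CL/ICLKernel.h, so the ACL source tree must be reachable from the include path.

#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void memset_example()
{
  CLScheduler::get().default_init();

  CLTensor t;
  t.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

  CLMemsetKernel fill;
  fill.configure(&t, PixelValue(0.f)); // no window argument -> fill the whole tensor

  t.allocator()->allocate();
  CLScheduler::get().enqueue(fill);
}
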
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
index 90e8b5705..5394a062c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
#define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
index fa383c0d0..384050aff 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
#define __ARM_COMPUTE_CLNEGKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
new file mode 100644
index 000000000..1d64f9f7d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__
+#define __ARM_COMPUTE_CLONEHOTKERNEL_H__
+#include "src/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+namespace arm_compute
+{
+class ICLTensor;
+/** Interface for the kernel to perform one-hot encoding*/
+class CLOneHotKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLOneHotKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHotKernel(const CLOneHotKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHotKernel &operator=(const CLOneHotKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLOneHotKernel(CLOneHotKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLOneHotKernel &operator=(CLOneHotKernel &&) = default;
+ /** Default destructor */
+ ~CLOneHotKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
+ ICLTensor *output, int depth, int axis = -1);
+ /** Initialise the kernel's inputs and output already initialized to off_value
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, int depth,
+ int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLOneHotKernel
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[in] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLOneHotKernel without off_value
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *output, int depth, int axis = -1);
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Initialise the kernel's inputs and outputs internally
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank)
+ */
+ void configure_common(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+ int depth, int axis);
+
+private:
+ const ICLTensor *_indices; /**< Indices tensor */
+ const ICLTensor *_on_value; /**< On value tensor */
+ const ICLTensor *_off_value; /**< Off value tensor */
+ ICLTensor *_output; /**< Destination tensor */
+ bool _is_off_value_memset; /**< Whether off_value is zero */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLONEHOTKERNEL_H__ */
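
For reference, a one-hot usage sketch (illustrative) through the CLOneHot runtime function added together with this kernel; the configure argument order is assumed to match the kernel's (indices, on_value, off_value, output, depth, axis), and the shapes below simply encode four S32 indices into depth 10 along the innermost axis.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLOneHot.h"

using namespace arm_compute;

void one_hot_example()
{
  CLScheduler::get().default_init();

  CLTensor indices, on_value, off_value, output;
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
  on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));  // e.g. 1.0f
  off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32)); // e.g. 0.0f
  output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

  CLOneHot one_hot;
  one_hot.configure(&indices, &on_value, &off_value, &output, /*depth=*/10, /*axis=*/-1);

  indices.allocator()->allocate();
  on_value.allocator()->allocate();
  off_value.allocator()->allocate();
  output.allocator()->allocate();
  one_hot.run();
}
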
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h
new file mode 100644
index 000000000..d4230aaf3
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYERKERNELEX_H
+#define ARM_COMPUTE_CLPADLAYERKERNELEX_H
+
+#include "src/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the PadLayer function. */
+class CLPadLayerKernelEx : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPadLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernelEx(const CLPadLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernelEx &operator=(const CLPadLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernelEx(CLPadLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernelEx &operator=(CLPadLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~CLPadLayerKernelEx() = default;
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data types supported: U8, S8, QASYMM8,
+ * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Set the input and output tensor.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The
+ * pair padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPadLayerKernelEx
+ *
+ * @param[in] input Source tensor info. Data types supported: U8, S8, QASYMM8,
+ * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+ * @param[in] output Output tensor info. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ int _input_start_x;
+ int _input_start_y;
+ bool _4d_enabled;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYERKERNELEX_H */
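
A constant-pad sketch (illustrative) via the CLPadLayerEx runtime function added in this change; its configure signature is assumed to mirror the kernel's, and the 3x3 -> 5x5 shapes just pad one element on every side of the two innermost dimensions.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

using namespace arm_compute;

void pad_example()
{
  CLScheduler::get().default_init();

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(3U, 3U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(5U, 5U), 1, DataType::F32));

  PaddingList padding = {{1, 1}, {1, 1}}; // (before, after) for each padded dimension

  CLPadLayerEx pad;
  pad.configure(&input, &output, padding, PixelValue(0.f), PaddingMode::CONSTANT);

  input.allocator()->allocate();
  output.allocator()->allocate();
  pad.run();
}
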
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
index 4e1b56cba..3f60db7bb 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
#define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
index 9b8a239d3..548f29a27 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -47,8 +47,8 @@
#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/TypesEx.h"
+#include "src/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
namespace arm_compute
{
@@ -95,7 +95,7 @@ public:
* @return N/A
*/
void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
- ReduceOperation op);
+ ReductionOperation op);
/**
* @brief Static function to check if given info will lead to a valid configuration of @ref
@@ -108,7 +108,7 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
- ReduceOperation op);
+ ReductionOperation op);
/*
* @brief Run CLReduceOperationKernel op
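
For clarity on the signature change above, a hypothetical call site (not from this patch): the kernel is now configured with the stock arm_compute::ReductionOperation enum rather than the removed ReduceOperation type. Shapes, axis, and the reduced output dimension are illustrative.

#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void reduce_sum_example()
{
  CLScheduler::get().default_init();

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32)); // axis 0 reduced to size 1

  CLReduceOperationKernel reduce;
  reduce.configure(&input, &output, /*axis=*/0U, ReductionOperation::SUM);

  input.allocator()->allocate();
  output.allocator()->allocate();
  CLScheduler::get().enqueue(reduce);
}
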
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
index 4d4478ece..5f5b7f9b8 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
#define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
index aa4a14812..09073af7c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
// these parameters can be changed
#define _ITEMS 16 // number of items in a group
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
deleted file mode 100644
index 28114f8b5..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
-#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-class ITensor;
-class Window;
-class QuantizationInfo;
-} // namespace arm_compute
-
-namespace arm_compute
-{
-
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
- const float32x4_t &scale);
-
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset,
- const float32x4_t &invscale);
-
-float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale);
-
-void elementwise_op_quantized(
- const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t,
- float32x4_t, float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t,
- int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t));
-
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- float (*scalar_func)(const float &, const float &),
- int (*broadcast_func)(int, int, int, const float *, const float &, float *,
- const bool),
- int (*neon_func)(int, int, int, const float *, const float *, float *));
-
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
- int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
- uint8_t *, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *));
-} // namespace arm_compute
-#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h
deleted file mode 100644
index a827f48f8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__
-#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_fp16.h>
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the activation layer kernel. */
-class NEActivationLayerKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NEActivationLayerKernelEx"; }
- /** Constructor */
- NEActivationLayerKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEActivationLayerKernelEx(const NEActivationLayerKernelEx &) = delete;
- /** Default move constructor */
- NEActivationLayerKernelEx(NEActivationLayerKernelEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEActivationLayerKernelEx &operator=(const NEActivationLayerKernelEx &) = delete;
- /** Default move assignment operator */
- NEActivationLayerKernelEx &operator=(NEActivationLayerKernelEx &&) = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this
- * tensor will store the result
- * of the activation function. Data types supported:
- * QASYMM8/QSYMM16/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] activation_info Activation layer information.
- */
- void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEActivationLayerKernelEx
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor
- * will store the result
- * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ActivationLayerInfo &act_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using ActivationFunction = ActivationLayerInfo::ActivationFunction;
- /** Common signature for all the specialised @ref NEActivationLayerKernelEx functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using ActivationFunctionExecutorPtr = void (NEActivationLayerKernelEx::*)(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
- activation(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type
- activation(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type
- activation(const Window &window);
-
-private:
- ITensor *_input;
- ITensor *_output;
- ActivationFunctionExecutorPtr _func;
- ActivationLayerInfo _act_info;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
deleted file mode 100644
index 8c544cda8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
-#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-
-class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel
-{
-public:
- /** Default destructor */
- ~NEBinaryLogicalOperationKernel() = default;
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEBinaryLogicalOperationKernel
- *
- * @param[in] op Binary logical operation to be executed.
- * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[in] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2,
- ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEBinaryLogicalOperationKernel
- *
- * @param[in] op Binary logical operation to be executed.
- * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a Status
- */
- static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1,
- const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
- const ITensorInfo &output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
index 1693922b7..036d56e69 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,63 +37,58 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__
+#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__
-#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
-#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
-#include "arm_compute/core/TypesEx.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
class ITensor;
-/** Basic function to simulate a reduction operation. This function calls the following NEON
- * kernels:
- *
- * -# @ref NEFillBorderKernel
- * -# @ref NEReductionOperationKernelEx
- *
+/**
+ * @brief Class for the kernel that converts a boolean input tensor
*/
-class NEReductionOperationEx : public IFunction
+class NECastBoolKernel : public INEKernel
{
public:
- /** Default constructor */
- NEReductionOperationEx();
- /** Set the input and output tensors.
+ const char *name() const override { return "NECastBoolKernel"; }
+ /** Default constructor*/
+ NECastBoolKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECastBoolKernel(const NECastBoolKernel &) = delete;
+ /** Default move constructor */
+ NECastBoolKernel(NECastBoolKernel &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECastBoolKernel &operator=(const NECastBoolKernel &) = delete;
+ /** Default move assignment operator */
+ NECastBoolKernel &operator=(NECastBoolKernel &&) = default;
+ /** Set the input and output of the kernel
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
- * @param[in] axis Dimension along which to reduce.
- * @param[in] op Reduction operation to perform.
+ * Valid conversions Input -> Output :
+ *
+ * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
*/
- void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op);
-
+ void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref
- * NEReductionOperationEx.
+ * NECastBoolKernel
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
- * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
- * input.
- * @param[in] axis Dimension along which to reduce.
- * @param[in] op Reduction operation to perform.
+ * @param[in] input Source tensor info. Data types supported: U8
+ * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
- ReduceOperation op);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
// Inherited methods overridden:
- void run() override;
+ void run(const Window &window, const ThreadInfo &info) override;
private:
- NEReductionOperationKernelEx _reduction_kernel;
- NEFillBorderKernel _fill_border_kernel;
- size_t _window_split;
- int _reduction_axis;
+ const ITensor *_input;
+ ITensor *_output;
};
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */
+#endif /*__ARM_COMPUTE_NECASTBOOLKERNEL_H__ */
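A minimal usage sketch for the new NECastBoolKernel, assuming the in-tree include paths introduced by
this patch and tensors that are already initialised and allocated by the caller:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    // Hypothetical sketch: cast a U8 boolean tensor to the data type of 'output' (e.g. F32).
    void run_cast_bool(const arm_compute::ITensor *input, arm_compute::ITensor *output)
    {
      arm_compute::NECastBoolKernel cast_bool;
      ARM_COMPUTE_ERROR_THROW_ON(arm_compute::NECastBoolKernel::validate(input->info(), output->info()));
      cast_bool.configure(input, output);
      arm_compute::NEScheduler::get().schedule(&cast_bool, arm_compute::Window::DimY);
    }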
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
index 88f21c96e..621500eb8 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
index 6e8bdc1c2..f8f7ac567 100644
--- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,74 +37,56 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
+#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#ifndef __ARM_COMPUTE_CPPONEHOTERNEL_H__
-#define __ARM_COMPUTE_CPPONEHOTERNEL_H__
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
class ITensor;
-
-/** CPP kernel to perform tensor OneHot operation. */
-class CPPOneHotKernelEx : public ICPPKernel
+/** NEON kernel to add a bias to each row of the input tensor */
+class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
{
public:
- const char *name() const override { return "CPPOneHotKernelEx"; }
+ const char *name() const override { return "NEGEMMMatrixAccumulateBiasesKernel"; }
/** Default constructor */
- CPPOneHotKernelEx();
+ NEGEMMMatrixAccumulateBiasesKernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPOneHotKernelEx(const CPPOneHotKernelEx &) = delete;
+ NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPOneHotKernelEx &operator=(const CPPOneHotKernelEx &) = delete;
+ NEGEMMMatrixAccumulateBiasesKernel &
+ operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
/** Allow instances of this class to be moved */
- CPPOneHotKernelEx(CPPOneHotKernelEx &&) = default;
+ NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
/** Allow instances of this class to be moved */
- CPPOneHotKernelEx &operator=(CPPOneHotKernelEx &&) = default;
+ NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
/** Default destructor */
- ~CPPOneHotKernelEx() = default;
-
- /** Set the input and output of the kernel.
+ ~NEGEMMMatrixAccumulateBiasesKernel() = default;
+ /** Set the accumulate buffer and the biases of the kernel.
*
- * @param[in] indices A tensor for indices. Data types supported: S32
- * @param[in] depth A tensor for depth. Data types supported: S32
- * @param[in] on_value A tensor for on_value. Data types supported: F32
- * @param[in] off_value A tensor for off_value. Data types supported: F32*
- * @param[out] output A tensor for computed value of one hot operator
- * @param[in] axis An int value for axis
+ * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type
+ * supported: Same as @p accum
*/
- void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
- const ITensor *off_value, ITensor *output, const int axis);
-
+ void configure(ITensor *accum, const ITensor *biases);
/** Static function to check if given info will lead to a valid configuration of @ref
- * CPPOneHotKernelEx
+ * NEGEMMMatrixAccumulateBiasesKernel
*
- * @param[in] indices A tensor for indices. Data types supported: S32
- * @param[in] depth A tensor for depth. Data types supported: S32
- * @param[in] on_value A tensor for on_value. Data types supported: F32
- * @param[in] off_value A tensor for off_value. Data types supported: F32*
- * @param[in] axis An int value for axis
+ * @param[in] accum The accumulate tensor to convert. Data type supported: F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type
+ * supported: Same as @p accum
*
* @return a status
*/
- static Status validate(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
- const ITensor *off_value, const int axis);
+ static Status validate(const ITensorInfo *accum, const ITensorInfo *biases);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
private:
- /** Template function to run the topKV operation. */
- template <typename T> void run_one_hot();
-
- const ITensor *_indices;
- const ITensor *_depth;
- const ITensor *_on_value;
- const ITensor *_off_value;
- ITensor *_output;
- int _axis;
+ ITensor *_accum;
+ const ITensor *_biases;
};
} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CPPONEHOTKERNEL_H__ */
+#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */
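The NEON kernel carried over here mirrors the CL variant: it adds a 1-D bias vector to every row of an
F32 accumulator in place. A hedged usage sketch with caller-prepared, allocated tensors:

    #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    // Hypothetical sketch: 'accum' is an MxN F32 tensor, 'biases' a 1-D F32 tensor of length N.
    void accumulate_biases(arm_compute::ITensor *accum, const arm_compute::ITensor *biases)
    {
      arm_compute::NEGEMMMatrixAccumulateBiasesKernel kernel;
      kernel.configure(accum, biases); // adds 'biases' to each row of 'accum' in place
      arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }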
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
index e765aa489..a03e08ade 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
@@ -126,6 +126,7 @@ private:
const ITensor *_input;
const ITensor *_indices;
int _axis;
+ size_t _indices_rank;
ITensor *_output;
kernel_ptr _func;
};
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
index cb2a485d5..fb3a72725 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
index 8724cc69b..1d786b59e 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
index 198b0be9d..ab534fe96 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
new file mode 100644
index 000000000..c1c9f7a3c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#define __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#include "src/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Kernel to perform a one-hot operation on NEON */
+class NEOneHotKernel : public INEKernel
+{
+public:
+ /** Default constructor. */
+ NEOneHotKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEOneHotKernel(const NEOneHotKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEOneHotKernel &operator=(const NEOneHotKernel &) = delete;
+ /** Allow instances of this class to be moved. */
+ NEOneHotKernel(NEOneHotKernel &&) = default;
+ /** Allow instances of this class to be moved. */
+ NEOneHotKernel &operator=(NEOneHotKernel &&) = default;
+ /** Default destructor */
+ ~NEOneHotKernel() = default;
+ /** Name of the kernel
+ *
+ * @return Kernel name
+ */
+ const char *name() const override { return "NEOneHotKernel"; }
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] depth The tensor for depth of the one hot dimension.
+ * Supported tensor rank: up to 3.
+ * Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1.
+ * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1.
+ * Data type supported: Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around.
+ * Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+ const ITensor *off_value, ITensor *output, int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEOneHotKernel
+ *
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3.
+ * Must be one of the following types: U32/S32
+ * @param[in] depth The tensor info for depth of the one hot dimension.
+ * Supported tensor rank: up to 3.
+ * Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor info. Supported tensor rank: only 1.
+ * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor info. Supported tensor rank: only 1.
+ * Data type supported: Same as @p on_value
+ * @param[out] output Destination tensor info. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis = -1);
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Implementation of the onehot operation for 0 axis.
+ *
+ * For one-hot on the 0 axis, an element-by-element copy is performed.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void onehot_0_axis(const Window &window, const ThreadInfo &info);
+ /** Implementation of the onehot operation.
+ *
+ * For axis >= 1, a row-wise copy takes place.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+ * returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ template <typename U> void onehot_n_axis(const Window &window, const ThreadInfo &info);
+ using kernel_ptr = void (NEOneHotKernel::*)(const Window &window, const ThreadInfo &info);
+ const ITensor *_indices;
+ const ITensor *_depth;
+ const ITensor *_on_value;
+ const ITensor *_off_value;
+ int _axis;
+ ITensor *_output;
+ kernel_ptr _func;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEONEHOTKERNEL_H__ */
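For orientation, the new kernel follows the usual one-hot layout: indices and depth are integer tensors,
on_value/off_value are single-element tensors of the output type, and the output gains one dimension of
size depth at the chosen axis (see compute_onehot_shape_ex further below). A hedged usage sketch with all
tensors assumed to be prepared by the caller:

    #include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    // Hypothetical sketch: indices/depth are S32 tensors, on_value/off_value are single-element
    // tensors matching the output type; the default axis of -1 wraps around as documented above.
    void run_one_hot(const arm_compute::ITensor *indices, const arm_compute::ITensor *depth,
                     const arm_compute::ITensor *on_value, const arm_compute::ITensor *off_value,
                     arm_compute::ITensor *output)
    {
      arm_compute::NEOneHotKernel one_hot;
      one_hot.configure(indices, depth, on_value, off_value, output, -1 /* axis */);
      arm_compute::NEScheduler::get().schedule(&one_hot, arm_compute::Window::DimY);
    }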
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
index 0b080cf73..1fd5362ae 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
deleted file mode 100644
index c9024fbb3..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__
-#define __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a reduction operation */
-class NEReductionOperationKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NEReductionOperationKernelEx"; }
- /** Default constructor */
- NEReductionOperationKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReductionOperationKernelEx(const NEReductionOperationKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReductionOperationKernelEx &operator=(const NEReductionOperationKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- NEReductionOperationKernelEx(NEReductionOperationKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- NEReductionOperationKernelEx &operator=(NEReductionOperationKernelEx &&) = default;
- /** Default destructor */
- ~NEReductionOperationKernelEx() = default;
-
- /** Set the source, destination of the kernel
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported:
- * NCHW.
- * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0
- * @param[in] op Reduction operation to perform.
- */
- void configure(const ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEReductionOperationKernelEx.
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts
- * supported: NCHW.
- * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p
- * input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis : 0
- * @param[in] op Reduction operation to perform.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
- ReduceOperation op);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _reduction_axis;
- ReduceOperation _op;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h
index faba8a449..cda8a30b1 100644
--- a/compute/ARMComputeEx/arm_compute/core/TypesEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h
@@ -51,15 +51,6 @@ enum class ArgOperation
MIN,
};
-/** Available reduce operations */
-enum class ReduceOperation
-{
- MAX, /**< Max */
- MEAN, /**< Mean */
- SUM, /**< Sum */
- MIN, /**< Min */
-};
-
/** Available binary logical operations */
enum class BinaryLogicalOperation
{
diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
index d57e8fcf5..d7ec1b4f0 100644
--- a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
@@ -67,5 +67,5 @@ transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
unsigned int kernel_width, unsigned int kernel_height,
const PadStrideInfo &info, unsigned int invalid_right,
unsigned int invalid_top);
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_UTILSEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
index a9ceacbea..2aaab6b3a 100644
--- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
@@ -72,10 +72,10 @@ namespace shape_calculator
* @return the calculated shape
*/
inline TensorShape compute_transposeconv_upsampled_shape(
- const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
- std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
- unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
- unsigned int &pad_top, unsigned int &pad_bottom)
+ const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
+ std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
+ unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
+ unsigned int &pad_top, unsigned int &pad_bottom)
{
unsigned int sx = info.stride().first;
unsigned int sy = info.stride().second;
@@ -103,7 +103,7 @@ inline TensorShape compute_transposeconv_upsampled_shape(
unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right;
unsigned int pady_all_except_invallid =
- pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
+ pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left();
pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right;
pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top();
@@ -135,7 +135,7 @@ compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
TensorShape out_shape{input_shape};
@@ -160,7 +160,7 @@ inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
TensorShape output_shape{input->tensor_shape()};
output_shape.set(idx_width, input->dimension(idx_width) * block);
@@ -238,6 +238,36 @@ inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
return output_shape;
}
+/** Calculate the one-hot output shape of a tensor
+ *
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] depth Depth of the one-hot dimension
+ * @param[in] actual_axis The axis at which the one-hot dimension is inserted
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_onehot_shape_ex(const TensorShape &indices_shape, uint32_t depth,
+ uint32_t actual_axis)
+{
+ ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON(actual_axis > indices_shape.num_dimensions());
+
+ TensorShape output_shape;
+ output_shape.set(actual_axis, depth);
+
+ unsigned int i_shift = 0;
+ for (unsigned int i = 0; i < indices_shape.num_dimensions(); ++i)
+ {
+ if (i == actual_axis)
+ {
+ i_shift++;
+ }
+ output_shape.set(i + i_shift, indices_shape[i]);
+ }
+
+ return output_shape;
+}
+
} // namespace shape_calculator
} // namespace misc
} // namespace arm_compute
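A short worked example of the compute_onehot_shape_ex() helper added above; the shapes are purely illustrative:

    #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"

    // indices_shape = [3, 4], depth = 5.
    // actual_axis = 0 -> [5, 3, 4] : depth is inserted in front, indices dims shift up by one.
    // actual_axis = 2 -> [3, 4, 5] : depth is appended after the indices dims.
    const arm_compute::TensorShape indices_shape(3U, 4U);
    const auto shape_axis0 =
      arm_compute::misc::shape_calculator::compute_onehot_shape_ex(indices_shape, 5U, 0U);
    const auto shape_axis2 =
      arm_compute::misc::shape_calculator::compute_onehot_shape_ex(indices_shape, 5U, 2U);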
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index cfbd13436..664b8b3b1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -16,14 +16,19 @@
#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
#define __ARM_COMPUTE_CLFUNCTIONSEX_H__
+#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h>
#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
+#include <arm_compute/runtime/CL/functions/CLCastBool.h>
#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
#include <arm_compute/runtime/CL/functions/CLGatherEx.h>
#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
#include <arm_compute/runtime/CL/functions/CLNeg.h>
+#include <arm_compute/runtime/CL/functions/CLOneHot.h>
+#include <arm_compute/runtime/CL/functions/CLPadLayerEx.h>
#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
+#include <arm_compute/runtime/CL/functions/CLSplitVEx.h>
#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
new file mode 100644
index 000000000..05bcc4075
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+class ICLTensor;
+
+/** Function to calculate the index of the minimum or maximum values in a
+ * tensor based on an axis.
+ *
+ * @note The default data type for an uninitialized output tensor is
+ * signed 32-bit integer (S32). It is the user's responsibility to check
+ * that the results do not overflow because the indices are computed
+ * in unsigned 32-bit (U32).
+ */
+class CLArgMinMaxLayerEx : public IFunction
+{
+public:
+ /** Default Constructor.
+ *
+ * @param[in] memory_manager (Optional) Memory manager.
+ */
+ CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[in] axis Axis to find max/min index.
+ * @param[out] output Output source tensor. Data types supported: U32/S32.
+ * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+ * ARG_IDX_MIN
+ */
+ void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLArgMinMaxLayerEx
+ *
+ * @param[in] input Input source tensor info. Data types supported: QASYMM8/F16/F32.
+ * @param[in] axis Axis to find max/min index.
+ * @param[in] output Output source tensor info. Data types supported: U32/S32.
+ * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+ * ARG_IDX_MIN
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+ const ReductionOperation &op);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::vector<CLTensor> _results_vector;
+ CLTensor _not_reshaped_output;
+ std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector;
+ CLReshapeLayer _reshape_kernel;
+ unsigned int _num_of_stages;
+ unsigned int _reduction_axis;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */
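A minimal usage sketch for CLArgMinMaxLayerEx; it assumes the CL scheduler has been initialised
(e.g. CLScheduler::get().default_init()) and that both tensors are initialised and allocated by the caller:

    #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"

    // Hypothetical sketch: index of the maximum along axis 0 of a QASYMM8/F16/F32 input.
    void run_argmax(const arm_compute::ICLTensor *input, arm_compute::ICLTensor *output)
    {
      arm_compute::CLArgMinMaxLayerEx argmax;
      argmax.configure(input, 0 /* axis */, output, arm_compute::ReductionOperation::ARG_IDX_MAX);
      argmax.run(); // runs the per-stage reduction kernels and the final reshape
    }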
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
index 88a9b00ec..fc4322798 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -43,6 +43,7 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPOneHotEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
index 7930e4e20..854ddce52 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPOneHotEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,31 +38,34 @@
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_CPPONEHOT_EX_H__
-#define __ARM_COMPUTE_CPPONEHOT_EX_H__
+/**
+ * @file CLCastBool.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLCastBool class
+ */
+
+#ifndef ARM_COMPUTE_CLCASTBOOL_H
+#define ARM_COMPUTE_CLCASTBOOL_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
-class ITensor;
+class ICLTensor;
-/** Basic function to run @ref CPPOneHot */
-class CPPOneHotEx : public ICPPSimpleFunction
+/**
+ * @brief Class to run @ref CLCastBoolKernel.
+ * This converts the boolean input tensor to the output tensor's type.
+ */
+class CLCastBool : public ICLSimpleFunction
{
public:
- /** Configure the one_hot function
- *
- * @param[in] indices A tensor for indices. Data types supported: S32
- * @param[in] depth A tensor for depth. Data types supported: S32
- * @param[in] on_value A tensor for on_value. Data types supported: F32
- * @param[in] off_value A tensor for off_value. Data types supported: F32
- * @param[out] output A tensor for computed value of one hot operator
- * @param[in] axis An int value for axis
+ /**
+ * @brief Initialise the kernel's input and output
+ * @param[in] input Input tensor. Data types supported: U8
+ * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/F16/F32.
*/
- void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
- const ITensor *off_value, ITensor *output, const int axis);
+ void configure(ICLTensor *input, ICLTensor *output);
};
-}
-#endif /* __ARM_COMPUTE_CPPONEHOT_EX_H__ */
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCASTBOOL_H */
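CLCastBool is a thin ICLSimpleFunction wrapper around CLCastBoolKernel, so its use is just
configure-then-run; tensor setup and CL scheduler initialisation are assumed to happen elsewhere:

    #include "arm_compute/runtime/CL/functions/CLCastBool.h"

    // Hypothetical sketch: convert a U8 boolean tensor to the data type of 'output'.
    void run_cast_bool_cl(arm_compute::ICLTensor *input, arm_compute::ICLTensor *output)
    {
      arm_compute::CLCastBool cast;
      cast.configure(input, output);
      cast.run(); // single kernel scheduled by ICLSimpleFunction::run()
    }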
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
index 409eaf593..026209f69 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
@@ -106,22 +106,24 @@ public:
CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
/** Set the input, weights, biases and output tensors.
*
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs.
- * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension.
- * Data type supported: Should match @p input data type, except for
- * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
- * @param[out] output Output tensor. The output has the same number of dimensions as the
- * @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this
- * is decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type,
+ * except for input of QASYMM8 and QASYMM8_SIGNED type
+ * where biases should be of S32 type
+ * @param[out] output Output tensor.
+ * The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for
+ * @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with
+ * @ref CLWeightsReshapeKernel.
*
*/
void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
@@ -130,23 +132,24 @@ public:
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
- * an optional 4th dimension for batch of inputs.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
* Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
- * type supported: Same as @p input.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
* @param[in] bias (Optional) The biases have one dimension.
* Data type supported: Should match @p input data type, except for
- * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * input of QASYMM8 and QASYMM8_SIGNED type
+ * where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as
- * the @p input.
+ * the @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution,
- * this is decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref
- * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
- * CLWeightsReshapeKernel.
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for
+ * @ref CLConvolutionLayer, specifies if the weights tensor has
+ * been reshaped with @ref CLWeightsReshapeKernel.
*
*/
void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
@@ -154,24 +157,26 @@ public:
unsigned int invalid_right, unsigned int invalid_bottom,
const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref
- * CLDirectTransposeConvLayer
+ * CLDirectTransposeConvLayer
*
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs.
- * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
- * type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension.
- * Data type supported: Should match @p input data type, except for input
- * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
- * @param[in] output Output tensor info. The output has the same number of dimensions as the
- * @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is
- * decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type,
+ * except for input of QASYMM8 and QASYMM8_SIGNED type
+ * where biases should be of S32 type
+ * @param[in] output Output tensor info. The output has the same number of dimensions
+ * as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped
+ * with @ref CLWeightsReshapeKernel.
*
* @return a status
*/
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
index fbee7e40e..b0149cb09 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -73,5 +73,5 @@ public:
*/
void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
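A minimal usage sketch for CLEmbeddingLookup built from the configure() signature shown above; the 8x16 embedding table, the S32 lookup indices and all shapes are illustrative assumptions, not part of the header:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // create CL context and queue

    // Embedding table: 8 rows of 16 floats; 4 lookup indices select 4 rows.
    CLTensor table, lookups, output;
    table.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    lookups.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

    CLEmbeddingLookup embedding_lookup;
    embedding_lookup.configure(&table, &output, &lookups); // (input, output, lookups)

    table.allocator()->allocate();
    lookups.allocator()->allocate();
    output.allocator()->allocate();
    // ... map the tensors and fill table/lookups here ...

    embedding_lookup.run();
    CLScheduler::get().sync();
    return 0;
}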
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
index f3266f688..c75ae9a50 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
@@ -43,14 +43,14 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
namespace arm_compute
{
@@ -182,5 +182,5 @@ private:
bool _is_prepared;
const ICLTensor *_original_weights;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
index e65a646dc..c08da526a 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
@@ -43,16 +43,14 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
namespace arm_compute
{
@@ -132,9 +130,6 @@ private:
* transpose_weights is set to true ) (called once)
* -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized
* asymmetric)
- * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref
- * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
- * not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -157,40 +152,36 @@ public:
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor. The weights must be 2 dimensional.
* If this function is called after a Convolution Layer, the (transposed)
- * weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed)
- * weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
+ * weights will have as many rows as the product of the first 3 input's dimensions. If it is
+ * called after another FullyConnected Layer, the (transposed) weights will have as many rows as
+ * the input's first dimension. Data type supported: Same as @p input.
* @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input.
* @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
* multiplication between:
* - The output of im2col on the input and the (transposed) 2D weights, if the
* function is called after a Convolution Layer
* - The input tensor and the (transposed) 2D weights, if the function is
- * called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
+ * called after another FullyConnected Layer. Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*/
void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref
- * CLFullyConnectedLayerEx
+ * CLFullyConnectedLayer
*
* @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor info. The weights must be 2 dimensional.
* If this function is called after a Convolution Layer, the (transposed)
- * weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed)
- * weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
+ * weights will have as many rows as the product of the first 3 input's dimensions. If it is
+ * called after another FullyConnected Layer, the (transposed) weights will have as many rows as
+ * the input's first dimension. Data type supported: Same as @p input.
* @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input.
* @param[out] output Destination tensor info. Its shape should be equal to the output of a
* matrix multiplication between:
* - The output of im2col on the input and the (transposed) 2D weights, if the
* function is called after a Convolution Layer
* - The input tensor and the (transposed) 2D weights, if the function is
- * called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
+ * called after another FullyConnected Layer. Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*
* @return a status
@@ -216,7 +207,7 @@ private:
CLConvertFullyConnectedWeights _convert_weights;
weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed;
weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged
- _reshape_weights_managed_function;
+ _reshape_weights_managed_function;
CLFlattenLayer _flatten_layer;
CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function;
CLGEMM _mm_gemm;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
index 289ab167f..bdb168664 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
@@ -43,8 +43,8 @@ public:
public:
CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
- : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
- _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false)
+ : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
+ _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false)
{
// DO NOTHING
}
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
index b01ec4255..385eb0b2c 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
@@ -47,11 +47,14 @@
#ifndef __ARM_COMPUTE_CLGATHEREX_H__
#define __ARM_COMPUTE_CLGATHEREX_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/**
 * @brief Class to run @ref CLGatherKernel.
@@ -66,7 +69,7 @@ public:
* @param[out] output The output tensor, Data types supported: same as @p input.
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
* @return N/A
- */
+ */
void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
/**
@@ -81,5 +84,5 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
const ITensorInfo *output, int axis = 0);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */
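A usage sketch for CLGatherEx::configure() as declared above, gathering along the default axis 0; the shapes and data types below are illustrative assumptions:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGatherEx.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Gather 3 elements along axis 0 (the x dimension) of a 16x4 tensor -> 3x4 result.
    CLTensor input, indices, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

    CLGatherEx gather;
    gather.configure(&input, &indices, &output); // axis defaults to 0

    input.allocator()->allocate();
    indices.allocator()->allocate();
    output.allocator()->allocate();

    gather.run();
    CLScheduler::get().sync();
    return 0;
}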
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
index 6618f5aa4..5e172a4c7 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -78,5 +78,5 @@ public:
void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput,
ICLTensor *output, ICLTensor *hits);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
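A sketch of driving CLHashtableLookup::configure() shown above; the S32 keys and lookups, the U8 hits tensor and the value shapes are assumptions made for illustration only:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 8 table entries of 16 floats keyed by 8 S32 keys; 4 lookups yield 4 rows plus a hit mask.
    CLTensor lookups, keys, values, output, hits;
    lookups.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
    keys.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));
    values.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    hits.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::U8));

    CLHashtableLookup hashtable_lookup;
    hashtable_lookup.configure(&lookups, &keys, &values, &output, &hits);

    for(auto *t : {&lookups, &keys, &values, &output, &hits})
        t->allocator()->allocate();
    // ... fill lookups, keys and values here ...

    hashtable_lookup.run();
    CLScheduler::get().sync();
    return 0;
}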
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
index 887e7aaa5..02ae6d719 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
@@ -41,11 +41,14 @@
#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
 /** Basic function to perform an Instance normalization.
*
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
new file mode 100644
index 000000000..62a36f06d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLONEHOT_H__
+#define __ARM_COMPUTE_CLONEHOT_H__
+
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Basic function to run @ref CLOneHotKernel */
+class CLOneHot : public IFunction
+{
+public:
+ /** Constructor */
+ CLOneHot();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHot(const CLOneHot &) = delete;
+ /** Default move constructor */
+ CLOneHot(CLOneHot &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOneHot &operator=(const CLOneHot &) = delete;
+ /** Default move assignment operator */
+ CLOneHot &operator=(CLOneHot &&) = default;
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * Value must be in the range [-indices.rank, indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value,
+ ICLTensor *output, int depth, int axis = -1);
+ /** Initialise the kernel's inputs and outputs with off_value being constant
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] off_value The PixelValue for off value. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * Value must be in the range [-indices.rank, indices.rank)
+ */
+ void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+ PixelValue off_value, int depth, int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLOneHotKernel
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[in] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * Value must be in the range [-indices.rank, indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis = -1);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLMemsetKernel _memset_kernel; /**< Memset kernel */
+ CLOneHotKernel _onehot_kernel; /**< OneHot kernel */
+ bool _has_to_memset;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLONEHOT_H__ */
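A usage sketch for the tensor-off_value overload of CLOneHot::configure() declared above; the index count, the depth of 10 and the output layout are illustrative assumptions:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLOneHot.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 4 S32 class indices, one-hot encoded with depth 10 along the innermost axis.
    CLTensor indices, on_value, off_value, output;
    indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
    on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));   // e.g. holds 1.0f
    off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));  // e.g. holds 0.0f
    output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32)); // illustrative layout

    CLOneHot one_hot;
    one_hot.configure(&indices, &on_value, &off_value, &output, /*depth=*/10, /*axis=*/-1);

    for(auto *t : {&indices, &on_value, &off_value, &output})
        t->allocator()->allocate();
    // ... fill indices, on_value and off_value here ...

    one_hot.run();
    CLScheduler::get().sync();
    return 0;
}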
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
new file mode 100644
index 000000000..ee1879aaa
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYEREX_H
+#define ARM_COMPUTE_CLPADLAYEREX_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+#include "src/core/gpu/cl/kernels/ClCopyKernel.h"
+// #include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels:
+ *
+ * -# @ref CLPadLayerKernelEx if there is padding to be added
+ * -# @ref CLCopyKernel otherwise
+ */
+class CLPadLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLPadLayerEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerEx(const CLPadLayerEx &) = delete;
+ /** Default move constructor */
+ CLPadLayerEx(CLPadLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerEx &operator=(const CLPadLayerEx &) = delete;
+ /** Default move assignment operator */
+ CLPadLayerEx &operator=(CLPadLayerEx &&) = default;
+
+ /** Initialize the function
+ *
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The
+ * pair padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPadLayerEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void configure_reflect_mode(ICLTensor *input, ICLTensor *output);
+
+ std::unique_ptr<CLPadLayerKernelEx> _pad_kernel;
+ std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel;
+ bool _perform_pad;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYEREX_H */
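A sketch of CLPadLayerEx::configure() in the default CONSTANT mode; the 32x32x3 input and the per-dimension padding amounts are illustrative assumptions:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Pad dim 0 by 2 on each side and dim 1 by 1 on each side: 32x32x3 -> 36x34x3.
    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(36U, 34U, 3U), 1, DataType::F32));

    const PaddingList padding = {{2, 2}, {1, 1}}; // (before, after) per dimension

    CLPadLayerEx pad;
    pad.configure(&input, &output, padding); // constant_value and mode keep their defaults

    input.allocator()->allocate();
    output.allocator()->allocate();

    pad.run();
    CLScheduler::get().sync();
    return 0;
}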
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
index 7dba84b12..45eb72bef 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -48,7 +48,7 @@
#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
-#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -82,7 +82,7 @@ public:
* @return N/A
*/
void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
- bool keep_dims, ReduceOperation op);
+ bool keep_dims, ReductionOperation op);
/**
* @brief Static function to check if given info will lead to a valid configuration of @ref
@@ -96,7 +96,8 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const std::set<uint32_t> &axis, bool keep_dims, const ReduceOperation &op);
+ const std::set<uint32_t> &axis, bool keep_dims,
+ const ReductionOperation &op);
/**
* @brief Run the OpenCL kernel for this operation
@@ -115,5 +116,5 @@ private:
std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
CLReshapeLayer _reshape;
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
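A sketch of the updated configure() signature that now takes a ReductionOperation; the tensor shapes, the reduction axis, the SUM operation and the nullptr memory manager passed to the constructor are illustrative assumptions:

#include <set>

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Sum-reduce axis 0 of an 8x4 tensor without keeping the reduced dimension: result is 1-D of 4.
    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));

    const std::set<uint32_t> axis{0};

    CLReduceOperation reduce(nullptr); // memory manager assumed optional in this sketch
    reduce.configure(&input, &output, axis, /*keep_dims=*/false, ReductionOperation::SUM);

    input.allocator()->allocate();
    output.allocator()->allocate();

    reduce.run();
    CLScheduler::get().sync();
    return 0;
}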
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
new file mode 100644
index 000000000..3023df3f0
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSPLITVEX__
+#define __ARM_COMPUTE_CLSPLITVEX__
+
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/functions/CLSlice.h"
+#include "arm_compute/core/Types.h"
+#include <vector>
+#include <memory>
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSplitVKernel */
+class CLSplitVEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLSplitVEx();
+ /** Configure the split CL kernel
+ *
+ * @param[in] input The input tensor to split. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] size_splits A 1-D tensor containing the number of tensor values per split
+ * @param[out] outputs A vector containing the output tensors. Data types supported: Same as
+ * @p input
+ * The output tensors should match the input tensor dimensions for all
+ * shape dimensions apart from the split dimension.
+ * @param[in] split_dim Integer value representing the input tensor dimension along which to
+ * split
+ * @param[in] num_splits Number of splits
+ */
+ void configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
+ const std::vector<ICLTensor *> &outputs, unsigned int num_splits);
+
+ void run() override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_size_splits;
+ std::vector<ICLTensor *> _outputs;
+ unsigned int _num_splits;
+ std::vector<CLSlice> _slice_functions;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPLITVEX__ */
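A sketch of CLSplitVEx::configure() as declared above, splitting a 12x4 tensor into three pieces along dimension 0; the split sizes, the output shapes and the assumption that size_splits must be filled before configure() are all illustrative:

#include <vector>

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSplitVEx.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // size_splits holds {2, 4, 6}, so the 12-wide dimension 0 is split into widths 2, 4 and 6.
    CLTensor input, size_splits, out0, out1, out2;
    input.allocator()->init(TensorInfo(TensorShape(12U, 4U), 1, DataType::F32));
    size_splits.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
    out0.allocator()->init(TensorInfo(TensorShape(2U, 4U), 1, DataType::F32));
    out1.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
    out2.allocator()->init(TensorInfo(TensorShape(6U, 4U), 1, DataType::F32));

    // Assumption: the split sizes are read from size_splits, so it is allocated and
    // filled with {2, 4, 6} before configure() is called.
    size_splits.allocator()->allocate();
    // ... map size_splits and write {2, 4, 6} ...

    std::vector<ICLTensor *> outputs{&out0, &out1, &out2};

    CLSplitVEx splitv;
    splitv.configure(&input, &size_splits, /*split_dim=*/0, outputs, /*num_splits=*/3);

    for(auto *t : {&input, &out0, &out1, &out2})
        t->allocator()->allocate();

    splitv.run();
    CLScheduler::get().sync();
    return 0;
}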
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
index e301a5152..f426a4d75 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -160,5 +160,5 @@ private:
CLTopKV2Store _store_kernel;
#endif
};
-}
+} // namespace arm_compute
#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
index 5fb102e47..5b27d362a 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -63,20 +63,22 @@ public:
/** Set the input, weights, biases and output tensors.
*
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same
- * as @p input.
- * @param[out] output Output tensor. The output has the same number of dimensions as the
- * @p input.
- * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this
- * is described in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions
+ * as the @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for
+ * @ref CLConvolutionLayer, specifies if the weights tensor has
+ * been reshaped with @ref CLWeightsReshapeKernel.
*
*/
void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
@@ -85,22 +87,22 @@ public:
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
- * an optional 4th dimension for batch of inputs. Data types supported:
- * QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
- * type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported:
- * Same as @p input.
- * @param[out] output Output tensor. The output has the same number of dimensions as
- * the @p input.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions
+ * as the @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution,
- * this is described in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref
- * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
- * CLWeightsReshapeKernel.
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for
+ * @ref CLConvolutionLayer, specifies if the weights tensor has
+ * been reshaped with @ref CLWeightsReshapeKernel.
*
*/
void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
@@ -108,22 +110,24 @@ public:
unsigned int invalid_right, unsigned int invalid_bottom,
const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayer
+ * CLTransposeConvLayer
*
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
- * type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as
- * @p input.
- * @param[in] output Output tensor info. The output has the same number of dimensions as the
- * @p input.
- * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is
- * described in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions
+ * as the @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with
+ * @ref CLWeightsReshapeKernel.
*
* @return a status
*/
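A usage sketch matching the configure() overload documented above; the layer geometry (3x3 kernel, stride 2, pad 1) and the resulting 27x27 output are illustrative assumptions:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 14x14x64 input, 3x3 weights mapping 64 -> 32 channels, stride 2, pad 1:
    // output side = 2 * (14 - 1) + 3 - 2 * 1 = 27.
    CLTensor input, weights, bias, output;
    input.allocator()->init(TensorInfo(TensorShape(14U, 14U, 64U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 32U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(27U, 27U, 32U), 1, DataType::F32));

    const PadStrideInfo deconv_info(2, 2, 1, 1); // stride_x, stride_y, pad_x, pad_y

    CLTransposeConvLayer deconv;
    deconv.configure(&input, &weights, &bias, &output, deconv_info,
                     /*invalid_right=*/0, /*invalid_bottom=*/0);

    for(auto *t : {&input, &weights, &bias, &output})
        t->allocator()->allocate();

    deconv.run();
    CLScheduler::get().sync();
    return 0;
}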
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
index 3fad230f1..d0ddc2609 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -16,13 +16,13 @@
#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__
#define __ARM_COMPUTE_NEFUNCTIONSEX_H__
-#include <arm_compute/runtime/NEON/functions/NEActivationLayerEx.h>
-#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
+#include <arm_compute/runtime/NEON/functions/NECastBool.h>
#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEOneHot.h>
#include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h
deleted file mode 100644
index 6156c84f8..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__
-#define __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to run @ref NEActivationLayerKernelEx
- *
- * @note The function simulates an activation layer with the specified activation function.
- */
-class NEActivationLayerEx : public INESimpleFunctionNoBorder
-{
-public:
- /** Constructor
- *
- * @param[in] ctx Runtime context to be used by the function
- */
- NEActivationLayerEx(IRuntimeContext *ctx = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEActivationLayerEx(const NEActivationLayerEx &) = delete;
- /** Default move constructor */
- NEActivationLayerEx(NEActivationLayerEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEActivationLayerEx &operator=(const NEActivationLayerEx &) = delete;
- /** Default move assignment operator */
- NEActivationLayerEx &operator=(NEActivationLayerEx &&) = default;
- /** [NEActivationLayerEx snippet] **/
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will
- * be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this
- * tensor will store the result
- * of the activation function. Data types supported:
- * QASYMM8/QSYMM16/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] activation_info Activation layer parameters.
- */
- void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
- /** [NEActivationLayerEx snippet] **/
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEActivationLayerEx
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor
- * will store the result
- * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ActivationLayerInfo &act_info);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
deleted file mode 100644
index 026d30098..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
-#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
-
-#include "arm_compute/core/TypesEx.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEBinaryLogicalOperationKernel.
- *
- * @note The tensor data type for the inputs must be QASYMM8/U8.
- * @note The function performs a binary logical operation between two tensors.
- */
-class NEBinaryLogicalOperation : public INESimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output and conversion policy.
- *
- * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8.
- * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- * @param[in] op Binary Logical Operation to be performed.
- */
- void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEBinaryLogicalOperationKernel
- *
- * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- * @param[in] op Binary Logical Operation to be performed.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output, BinaryLogicalOperation op);
-};
-
-/** Basic function to run @ref NEBinaryLogicalOperationKernel
- *
- * @note The tensor data type for the inputs must be QASYMM8/U8.
- * @note The function performs a binary logical operation between two tensors.
- */
-template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output and conversion policy.
- *
- * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8
- * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(ITensor *input1, ITensor *input2, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEBinaryLogicalOperationKernel
- *
- * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output);
-};
-
-/** Basic function to run equal comparison. */
-using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
-/** Basic function to run not equal comparison. */
-using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
new file mode 100644
index 000000000..dd62645ee
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECASTBOOL_H__
+#define __ARM_COMPUTE_NECASTBOOL_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+
+/**
+ * @brief Class to run @ref INESimpleFunctionNoBorder.
+ */
+class NECastBool : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialize the function's source, destination
+ *
+ * Valid conversions Input -> Output :
+ *
+ * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NECastBool
+ *
+ * @param[in] input Source tensor info. Data types supported: U8.
+ * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECASTBOOL_H__*/
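A minimal sketch for NECastBool on NEON tensors; the U8 boolean input cast to F32 and the 8-element shape are illustrative assumptions:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NECastBool.h"

using namespace arm_compute;

int main()
{
    // Cast an 8-element U8 "bool" tensor to F32 (0.0f / 1.0f).
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::U8));
    output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));

    NECastBool cast_bool;
    cast_bool.configure(&input, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill input with 0/1 values ...

    cast_bool.run();
    return 0;
}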
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
index 63f7714aa..82a789e86 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
@@ -48,12 +48,14 @@
#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/core/Error.h"
#include <vector>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
* @brief Class to perform EmbeddingLookup operation
@@ -84,5 +86,5 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *output,
const ITensorInfo *lookups);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
index 56548a479..214592710 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -44,11 +44,11 @@
#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
index 8f98f220a..2bbb1fea1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
@@ -43,16 +43,16 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
namespace arm_compute
{
@@ -79,11 +79,11 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete;
/** Default move constructor */
- NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default;
+ NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete;
/** Default move assignment operator */
- NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default;
+ NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = delete;
/** Set the input and output tensors.
*
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
@@ -141,7 +141,7 @@ private:
void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
MemoryGroup _memory_group;
- NEFlattenLayerKernel _flatten_kernel;
+ NEFlattenLayer _flatten_kernel;
NEConvertFullyConnectedWeights _convert_weights;
NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
NEGEMM _mm_gemm;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
index 18cb61bf9..e34b4dcb0 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
@@ -43,8 +43,8 @@ public:
public:
NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
- : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
- _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
+ : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
+ _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
{
// DO NOTHING
}
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
index 155a1b837..6944c77f6 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -47,6 +47,7 @@
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEGatherKernelEx */
class NEGatherEx : public INESimpleFunctionNoBorder
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
index 521a05ad9..f6fda60a9 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
@@ -48,12 +48,14 @@
#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/core/Error.h"
#include <vector>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
* @brief Class to perform HashtableLookup operation
@@ -96,5 +98,5 @@ public:
const ITensorInfo *input, const ITensorInfo *output,
const ITensorInfo *hits);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
index 18e813923..0ee967698 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
@@ -54,6 +54,7 @@
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to perform a Instance normalization.
*
@@ -112,5 +113,5 @@ private:
Tensor _permuted_input;
Tensor _permuted_output;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
new file mode 100644
index 000000000..668f024a1
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEONEHOT_H__
+#define __ARM_COMPUTE_NEONEHOT_H__
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+
+/** Basic function to run @ref NEOneHotKernel */
+class NEOneHot : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up
+ * to 3. Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported:
+ * U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported:
+ * Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ */
+ void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+ const ITensor *off_value, ITensor *output, int axis = -1);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEOneHotKernel
+ *
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3.
+ * Must be one of the following types: U32/S32
+ * @param[in] depth The tensor info for depth of the one hot dimension.
+ * Supported tensor rank: up to 3.
+ * Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor info. Supported tensor rank: only 1.
+ * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor info. Supported tensor rank: only 1.
+ * Data type supported: Same as @p on_value
+ * @param[out] output Destination tensor info. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank)
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis = -1);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEONEHOT_H__ */
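A hedged usage sketch for the NEOneHot function declared above; the tensor shapes, data types and axis value are illustrative assumptions, only the configure()/validate() signatures come from this header.

// Hypothetical host-side usage of NEOneHot (shapes/values are assumptions).
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void one_hot_example()
{
  Tensor indices, depth, on_value, off_value, output;

  // 4 indices with depth 10 -> a 10x4 one-hot output (axis = -1, the innermost dimension).
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
  depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));
  on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

  // validate() mirrors configure(), so the configuration can be checked up front.
  const Status status = NEOneHot::validate(indices.info(), depth.info(), on_value.info(),
                                           off_value.info(), output.info(), -1);
  ARM_COMPUTE_ERROR_THROW_ON(status);

  NEOneHot one_hot;
  one_hot.configure(&indices, &depth, &on_value, &off_value, &output, -1);

  indices.allocator()->allocate();
  depth.allocator()->allocate();
  on_value.allocator()->allocate();
  off_value.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill indices/depth/on_value/off_value buffers here, then:
  one_hot.run();
}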
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
index 7f764b000..9858e6c09 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
@@ -43,10 +43,10 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/TypesEx.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/Tensor.h"
@@ -71,7 +71,7 @@ public:
* @param[in] op Reduce operation to perform.
*/
void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output,
- ReduceOperation op);
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref
* NEReduceOperation
@@ -85,14 +85,14 @@ public:
* @return A status
*/
static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
- bool keep_dims, const ITensorInfo *output, ReduceOperation op);
+ bool keep_dims, const ITensorInfo *output, ReductionOperation op);
// Inherited methods overridden:
void run() override;
private:
MemoryGroup _memory_group;
- std::vector<NEReductionOperationEx> _reduction_kernels;
+ std::vector<NEReductionOperation> _reduction_kernels;
std::vector<Tensor> _reduced_outs;
NEReshapeLayer _reshape;
unsigned int _reduction_ops;
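The hunk above moves NEReduceOperation onto the stock NEReductionOperation function and the standard ReductionOperation enum. A minimal call sketch against the updated configure() signature follows; the shapes and the MEAN_SUM choice are assumptions for illustration, not part of this patch.

// Hypothetical call against the updated NEReduceOperation interface.
#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void reduce_mean_over_hw(Tensor &input, Tensor &output)
{
  // Reduce over the two innermost axes (0 and 1), keeping the reduced dims as size 1.
  const Coordinates reduction_axis(0, 1);

  NEReduceOperation reduce;
  reduce.configure(&input, reduction_axis, /*keep_dims=*/true, &output,
                   ReductionOperation::MEAN_SUM);
  reduce.run();
}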
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
index 48b416923..f34a8f8af 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
@@ -43,11 +43,13 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
index 24ff5dac9..f82579a45 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -102,47 +102,50 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
/** Allow instances of this class to be moved */
- NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ NETransposeConvLayer(NETransposeConvLayer &&) = delete;
/** Allow instances of this class to be moved */
- NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = delete;
/** Default destructor */
virtual ~NETransposeConvLayer() = default;
/** Set the input, weights, biases and output tensors.
*
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
- * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
- * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
- * for F16 input.
- * @param[out] output Output tensor. The output has the same number of dimensions as the @p
- * input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is
- * decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension.
+ * Data type supported: Data types supported: S32 for QASYMM8 and
+ * QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
*
*/
void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
const PadStrideInfo &info, unsigned int invalid_right,
unsigned int invalid_bottom);
/** Static function to check if given info will lead to a valid configuration of @ref
- * NETransposeConvLayer
+ * NETransposeConvLayer
*
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types
- * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
- * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
- * input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is
- * decribed in @ref PadStrideInfo.
- * @param[in] innvalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input,
+ * F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
*
* @return a status
*/
@@ -168,5 +171,5 @@ private:
PadStrideInfo _info;
bool _is_prepared;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index ba42a2456..e15dc2685 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -54,104 +54,143 @@
using namespace arm_compute;
const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
- // ARMComputeEx kernels
- {"binary_logical_op", "binary_logical_op.cl"},
- {"embedding_lookup", "embedding_lookup.cl"},
- {"gather_ex", "gather_ex.cl"},
- {"gather_ex_1d", "gather_ex.cl"},
- {"gather_ex_1d_out", "gather_ex.cl"},
- {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
- {"hashtable_lookup", "hashtable_lookup.cl"},
- {"instance_normalization_ex", "instance_normalization_ex.cl"},
- {"multiply_scale_factor", "multiply_scale_factor.cl"},
- {"neg_tensor", "neg_tensor.cl"},
- {"quantization_symm8", "quantization_symm8.cl"},
- {"reduce_min_max", "reduce_operation.cl"},
- {"reduce_sum_mean", "reduce_operation.cl"},
- {"topkv2_init", "topkv2.cl"},
- {"topkv2_find_first_negative", "topkv2.cl"},
- {"topkv2_reorder_negatives", "topkv2.cl"},
- {"topkv2_store", "topkv2.cl"},
- {"radixsort_histogram", "topkv2_radixsort.cl"},
- {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
- {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
- {"radixsort_reorder", "topkv2_radixsort.cl"},
- {"topkv2_quicksort", "topkv2_quicksort.cl"},
- {"scale_factor_symm8", "scale_factor.cl"},
+ // ARMComputeEx kernels
+ {"arg_min_max_ex_x", "arg_min_max_ex.cl"},
+ {"arg_min_max_ex_y", "arg_min_max_ex.cl"},
+ {"arg_min_max_ex_z", "arg_min_max_ex.cl"},
+ {"arg_min_max_ex_w", "arg_min_max_ex.cl"},
+ {"binary_logical_op", "binary_logical_op.cl"},
+ {"cast_bool", "cast.cl"},
+ {"embedding_lookup", "embedding_lookup.cl"},
+ {"gather_ex", "gather_ex.cl"},
+ {"gather_ex_1d", "gather_ex.cl"},
+ {"gather_ex_1d_out", "gather_ex.cl"},
+ {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
+ {"gemm_accumulate_biases", "gemm.cl"},
+ {"hashtable_lookup", "hashtable_lookup.cl"},
+ {"instance_normalization_ex", "instance_normalization_ex.cl"},
+ {"memset", "memset.cl"},
+ {"multiply_scale_factor", "multiply_scale_factor.cl"},
+ {"neg_tensor", "neg_tensor.cl"},
+ {"one_hot", "one_hot.cl"},
+ {"one_hot_only_on_value", "one_hot.cl"},
+ {"pad_layer_constant", "pad_layer.cl"},
+ {"pad_layer_symmetric_reflect", "pad_layer.cl"},
+ {"quantization_symm8", "quantization_symm8.cl"},
+ {"reduce_min_max", "reduce_operation.cl"},
+ {"reduce_sum_mean", "reduce_operation.cl"},
+ {"topkv2_init", "topkv2.cl"},
+ {"topkv2_find_first_negative", "topkv2.cl"},
+ {"topkv2_reorder_negatives", "topkv2.cl"},
+ {"topkv2_store", "topkv2.cl"},
+ {"topkv2_quicksort", "topkv2_quicksort.cl"},
+ {"scale_factor_symm8", "scale_factor.cl"},
};
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
- {
- "embedding_lookup.cl",
+ {
+ "activation_float_helpers.h",
+#include "./cl_kernels/activation_float_helpers.hembed"
+ },
+ {
+ "arg_min_max_ex.cl",
+#include "./cl_kernels/arg_min_max_ex.clembed"
+ },
+ {
+ "binary_logical_op.cl",
+#include "./cl_kernels/binary_logical_op.clembed"
+ },
+ {
+ "cast.cl",
+#include "./cl_kernels/cast.clembed"
+ },
+ {
+ "embedding_lookup.cl",
#include "./cl_kernels/embedding_lookup.clembed"
- },
- {
- "gather_ex.cl",
+ },
+ {
+ "gather_ex.cl",
#include "./cl_kernels/gather_ex.clembed"
- },
- {
- "gemmlowp_ex.cl",
+ },
+ {
+ "gemmlowp_ex.cl",
#include "./cl_kernels/gemmlowp_ex.clembed"
- },
- {
- "hashtable_lookup.cl",
+ },
+ {
+ "gemm_helpers.h",
+#include "./cl_kernels/gemm_helpers.hembed"
+ },
+ {
+ "hashtable_lookup.cl",
#include "./cl_kernels/hashtable_lookup.clembed"
- },
- {
- "helpers.h",
+ },
+ {
+ "helpers.h",
#include "./cl_kernels/helpers.hembed"
- },
- {
- "helpers_asymm.h",
+ },
+ {
+ "helpers_asymm.h",
#include "./cl_kernels/helpers_asymm.hembed"
- },
- {
- "instance_normalization_ex.cl",
+ },
+ {
+ "instance_normalization_ex.cl",
#include "./cl_kernels/instance_normalization_ex.clembed"
- },
- {
- "binary_logical_op.cl",
-#include "./cl_kernels/binary_logical_op.clembed"
- },
- {
- "multiply_scale_factor.cl",
+ },
+ {
+ "gemm.cl",
+#include "./cl_kernels/gemm.clembed"
+ },
+ {
+ "memset.cl",
+#include "./cl_kernels/memset.clembed"
+ },
+ {
+ "multiply_scale_factor.cl",
#include "./cl_kernels/multiply_scale_factor.clembed"
- },
- {
- "neg_tensor.cl",
+ },
+ {
+ "neg_tensor.cl",
#include "./cl_kernels/neg_tensor.clembed"
- },
- {
- "quantization_symm8.cl",
+ },
+ {
+ "one_hot.cl",
+#include "./cl_kernels/one_hot.clembed"
+ },
+ {
+ "pad_layer.cl",
+#include "./cl_kernels/pad_layer.clembed"
+ },
+ {
+ "quantization_symm8.cl",
#include "./cl_kernels/quantization_symm8.clembed"
- },
- {
- "reduce_operation.cl",
+ },
+ {
+ "reduce_operation.cl",
#include "./cl_kernels/reduce_operation.clembed"
- },
- {
- "scale_factor.cl",
+ },
+ {
+ "repeat.h",
+#include "./cl_kernels/repeat.hembed"
+ },
+ {
+ "scale_factor.cl",
#include "./cl_kernels/scale_factor.clembed"
- },
- {
- "topkv2.cl",
+ },
+ {
+ "topkv2.cl",
#include "./cl_kernels/topkv2.clembed"
- },
- {
- "topkv2_radixsort.cl",
-#include "./cl_kernels/topkv2_radixsort.clembed"
- },
- {
- "topkv2_quicksort.cl",
+ },
+ {
+ "topkv2_quicksort.cl",
#include "./cl_kernels/topkv2_quicksort.clembed"
- },
+ },
#endif /* EMBEDDED_KERNELS */
};
CLKernelLibraryEx::CLKernelLibraryEx()
- : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
{
opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
// CLKernelLibraryEx is built
@@ -318,8 +357,8 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con
size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
ARM_COMPUTE_ERROR_ON_MSG(
- err != 0,
- "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
ARM_COMPUTE_UNUSED(err);
return result;
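The map changes above register the new kernels (arg_min_max_ex_*, cast_bool, one_hot, memset, pad_layer_*, gemm_accumulate_biases) against their .cl files and, under EMBEDDED_KERNELS, bind each file to its generated .clembed source string. The following self-contained sketch only illustrates that two-level lookup; resolve_program_source is an assumed helper name, not a CLKernelLibraryEx member.

// Illustrative two-level lookup mirroring the maps above (assumed helper, not library API).
#include <map>
#include <stdexcept>
#include <string>

std::string resolve_program_source(const std::string &kernel_name,
                                   const std::map<std::string, std::string> &kernel_program_map,
                                   const std::map<std::string, std::string> &program_source_map)
{
  // 1) kernel name -> .cl program file, e.g. "one_hot" -> "one_hot.cl"
  const auto program_it = kernel_program_map.find(kernel_name);
  if (program_it == kernel_program_map.end())
  {
    throw std::runtime_error("Unknown kernel: " + kernel_name);
  }
  // 2) program file -> embedded source string (the *.clembed contents)
  const auto source_it = program_source_map.find(program_it->second);
  if (source_it == program_source_map.end())
  {
    throw std::runtime_error("No embedded source for: " + program_it->second);
  }
  return source_it->second;
}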
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h
new file mode 100644
index 000000000..3c3ff8419
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+
+// Hard-Swish
+#define hard_swish_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
+
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL)
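The ACTIVATION/ACT_OP macros above token-paste the operation name onto "_op" and forward the data type and the A/B constants. The host-side check below restates a few of the scalar forms with std::min/std::max in place of the OpenCL built-ins (that swap is the only assumption) to show how ACTIVATION(op, ...) expands.

// Host-side restatement of relu/lu_brelu/linear to show the ACTIVATION expansion.
#include <algorithm>
#include <cassert>

#define MLA(a, b, c) ((b) * (c) + (a))
#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (std::max((DATA_TYPE)0.0, x))
#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) \
  (std::min(std::max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL)

int main()
{
  assert(ACTIVATION(relu, float, -2.0f, 0.0f, 0.0f) == 0.0f);     // max(0, x)
  assert(ACTIVATION(lu_brelu, float, 7.0f, 6.0f, -6.0f) == 6.0f); // clamp x to [B_VAL, A_VAL]
  assert(ACTIVATION(linear, float, 2.0f, 3.0f, 1.0f) == 7.0f);    // A_VAL * x + B_VAL
  return 0;
}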
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
new file mode 100644
index 000000000..135cacf59
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl
@@ -0,0 +1,564 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) isgreater(x, y)
+#define ISLESS(x, y) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) \
+ select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
+#define ISLESS(x, y) \
+ select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
+#if defined(WIDTH)
+#if defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+/** Find the index of the minimum value in a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the minimum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input,
+ __global const DATA_TYPE_OUTPUT *prev_res,
+ const int x_idx)
+{
+ int end_elem = (x_idx + 1) * 16;
+ if (end_elem > WIDTH)
+ {
+ end_elem = WIDTH - x_idx * 16;
+ }
+ DATA_TYPE_OUTPUT res = prev_res[0];
+ for (int x_v = 1; x_v < end_elem; ++x_v)
+ {
+ res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
+ }
+ return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find the index of the minimum value in a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the minimum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+ DATA_TYPE_OUTPUT res = 0;
+ for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+ {
+ res = select(res, x_v, *(input + x_v) < *(input + res));
+ }
+ return res;
+#else // WIDTH >= 16
+ int x_elem = x_idx * 16;
+ const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+ x_elem -= x_goback;
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input - x_goback);
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+ idx_sel = (in.s01234567 <= in.s89abcdef);
+ in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+ res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+ idx_sel.s0123 =
+ (in.s0123 < in.s4567) ||
+ (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+ in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+ idx_sel.s01 =
+ (in.s01 < in.s23) ||
+ (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+ idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+ return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MIN)
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+/** Find the index of the maximum value in a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the maximum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input,
+ __global const DATA_TYPE_OUTPUT *prev_res,
+ const int x_idx)
+{
+ int end_elem = (x_idx + 1) * 16;
+ if (end_elem > WIDTH)
+ {
+ end_elem = WIDTH - x_idx * 16;
+ }
+ DATA_TYPE_OUTPUT res = prev_res[0];
+ unsigned int res_int = res;
+ DATA_TYPE_OUTPUT condition_check2;
+ for (int x_v = 1; x_v < end_elem; ++x_v)
+ {
+ int i1 = prev_res[x_v];
+ condition_check2 = *(input + i1) > *(input + res_int);
+ res = select(res, prev_res[x_v], condition_check2);
+ }
+ return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find the index of the maximum value in a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return index of the maximum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+ DATA_TYPE_OUTPUT res = 0;
+ unsigned int i1;
+ unsigned int i2;
+ DATA_TYPE_OUTPUT condition_check;
+ for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+ {
+ i1 = x_v;
+ i2 = res;
+ condition_check = *(input + i1) > *(input + i2);
+ res = select(res, x_v, condition_check);
+ }
+ return res;
+#else // WIDTH >= 16
+ int x_elem = x_idx * 16;
+ const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+ x_elem -= x_goback;
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input - x_goback);
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
+ idx_sel = (in.s01234567 >= in.s89abcdef);
+ in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
+ res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+ idx_sel.s0123 =
+ (in.s0123 > in.s4567) ||
+ (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
+ in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
+ res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+ idx_sel.s01 =
+ (in.s01 > in.s23) ||
+ (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
+ in.s01 = select(in.s23, in.s01, idx_sel.s01);
+ res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+ idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
+ res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+ return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX)
+
+/** This kernel performs parallel reduction given an operation on x-axis.
+ *
+ * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed
+ * using -DPREV_OUTPUT
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
+ * -DDATA_TYPE_OUTPUT=uint
+ * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the
+ * ArgMax
+ * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the
+ * ArgMin
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the
+ * source tensor
+ * @param[in] prev_res_ptr (Optional) Pointer to previous results
+ * tensor. Supported data types: U32/S32
+ * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X
+ * dimension (in bytes)
+ * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y
+ * dimension (in bytes)
+ * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element
+ * in the previous results tensor
+ * @param[in] partial_res_ptr The local buffer to hold partial result
+ * values. Supported data types: U32/S32
+ * @param[in] partial_res_stride_x Stride of the output tensor in X dimension
+ * (in bytes)
+ * @param[in] partial_res_step_x partial_res_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension
+ * (in bytes)
+ * @param[in] partial_res_step_y partial_res_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the
+ * source tensor
+ * @param[in] local_results Local buffer for storing the partial result
+ */
+__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src),
+#if defined(PREV_OUTPUT)
+ IMAGE_DECLARATION(prev_res),
+#endif // defined(PREV_OUTPUT)
+ IMAGE_DECLARATION(partial_res),
+ __local DATA_TYPE_OUTPUT *local_results)
+{
+#if defined(PREV_OUTPUT)
+ Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+ Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
+#else // !defined(PREV_OUTPUT)
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#endif // defined(PREV_OUTPUT)
+ Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
+
+ unsigned int lsize = get_local_size(0);
+ unsigned int lid = get_local_id(0);
+
+ const uint x_idx = get_global_id(0);
+ const uint y_idx = get_global_id(1);
+ const __global DATA_TYPE *src_in_row =
+ (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);
+
+ for (unsigned int y = 0; y < get_local_size(1); ++y)
+ {
+#if defined(ARG_MAX)
+#if defined(PREV_OUTPUT)
+ local_results[lid] =
+ arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else // !defined(PREV_OUTPUT)
+ local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#else // defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+ local_results[lid] =
+ arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
+#else // !defined(PREV_OUTPUT)
+ local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Looking for the next highest power of 2 (maximum value of lsize is 8)
+ unsigned int middle = lsize - 1;
+ middle |= middle >> 1;
+ middle |= middle >> 2;
+ middle += 1;
+ // Perform parallel reduction
+ DATA_TYPE_OUTPUT condition_check3;
+ for (unsigned int i = middle; i > 0; i >>= 1)
+ {
+ if (lid < i && lid + i < lsize)
+ {
+ DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
+ DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
+#if defined(ARG_MAX)
+ condition_check3 =
+ ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
+ local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
+#else // defined(ARG_MIN)
+ local_results[lid] = select(
+ local_results[lid], local_results[lid + i],
+ ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (lid == 0)
+ {
+ ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
+ }
+ }
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
+ * -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
+ * -DDATA_TYPE_OUTPUT=uint
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (unsigned int y = 1; y < HEIGHT; ++y)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in =
+ CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, y, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, z, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold summed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+ Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, w, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
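Before its parallel reduction, arg_min_max_ex_x above rounds the local work-group size up to the next power of two so the halving loop covers every lane (the lid + i < lsize guard handles the padding). The scalar restatement below reproduces that rounding; as the kernel comment states, it is only valid for lsize <= 8 (larger sizes would need additional shift steps).

// Scalar restatement of the power-of-two rounding used in arg_min_max_ex_x.
#include <cassert>

unsigned int next_pow2_up_to_8(unsigned int lsize)
{
  unsigned int middle = lsize - 1;
  middle |= middle >> 1;
  middle |= middle >> 2;
  return middle + 1;
}

int main()
{
  assert(next_pow2_up_to_8(1) == 1);
  assert(next_pow2_up_to_8(5) == 8);
  assert(next_pow2_up_to_8(8) == 8);
  return 0;
}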
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
index e249663bc..f8b5bbeb8 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -111,14 +111,14 @@ __kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATI
#if OP_CODE == 1 // LOGICAL AND
VSTORE(VEC_SIZE)
(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
0, (__global DATA_TYPE *)output.ptr);
#elif OP_CODE == 2 // LOGICAL OR
VSTORE(VEC_SIZE)
(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
0, (__global DATA_TYPE *)output.ptr);
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..3b0a175a4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs an up-scaling depth conversion for boolean type input.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note The integer shift amount value needs to be passed at compile time using -DSHIFT:
+ * e.g. -DSHIFT=7
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ */
+__kernel void cast_bool(TENSOR3D_DECLARATION(in), TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(in_data & 1, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+ (__global DATA_TYPE_OUT *)out.ptr);
+}
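The cast_bool kernel above masks each boolean byte with 1 and converts the result to the destination type, so only the lowest bit of the stored boolean survives the cast. A scalar C++ reference of that element-wise behaviour, offered as an illustration of the semantics rather than the library's implementation:

// Scalar reference for the element-wise behaviour of cast_bool.
#include <cassert>
#include <cstdint>

template <typename OutT> OutT cast_bool_element(uint8_t in)
{
  return static_cast<OutT>(in & 1); // keep only the lowest bit of the boolean byte
}

int main()
{
  assert(cast_bool_element<int16_t>(0x00) == 0);
  assert(cast_bool_element<int16_t>(0x01) == 1);
  assert(cast_bool_element<float>(0xFF) == 1.0f); // odd "true" bytes map to 1
  return 0;
}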
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
index 92e5dfbee..5ebc78d23 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -117,15 +117,15 @@ __kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION
// lookup ids for based on the tensor dimensions
int lup_id[4] = {0};
- lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
- : get_global_id(0);
- lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
- : get_global_id(1);
+ lup_id[0] =
+ (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0);
+ lup_id[1] =
+ (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1);
lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
: get_global_id(2) % DEPTH_OUT;
lup_id[3] = (NUM_DIMS == 4)
- ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
- : get_global_id(2) / DEPTH_OUT;
+ ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ : get_global_id(2) / DEPTH_OUT;
in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl
new file mode 100644
index 000000000..9b826a2bd
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl
@@ -0,0 +1,7210 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+ ({ \
+ a = select( \
+ 0, a, \
+ CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), \
+ VEC_DATA_TYPE(DATA_TYPE, K0))); \
+ })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) ({})
+#endif // (SRC_WIDTH % K0)
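The guard above zeroes the lanes of a K0-wide load that fall past SRC_WIDTH when the row width is not a multiple of K0; a scalar C sketch of the same idea (names and types are illustrative):

/* Illustrative boundary guard: lane k of block x is kept only while
 * x * k0 + k is still inside the source row of width src_width. */
static void boundary_condition_x_ref(float *a, int x, int k0, int src_width)
{
  for (int k = 0; k < k0; ++k)
    if (x * k0 + k >= src_width)
      a[k] = 0.0f;
}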
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks
+ * of size M0xK0 and stores each one (not transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g.
+ * -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g.
+ * -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer
+ * 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) +
+ ((y / (uint)V0) * (uint)dst_stride_y) +
+ ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+ // Load values from the LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+ // ---------------------------Store output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
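As a host-side sketch, the compile-time defines documented above are supplied when the program is built; the option values below are illustrative only and are not taken from this patch:

#include <CL/cl.h>
#include <stddef.h>

/* Build the reshape-LHS program with example -D options (illustrative values). */
static cl_program build_reshape_lhs(cl_context ctx, cl_device_id dev, const char *src)
{
  cl_int err = CL_SUCCESS;
  cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
  const char *opts =
    "-DDATA_TYPE=float -DSRC_WIDTH=64 -DM0=4 -DK0=4 -DV0=2 -DINTERLEAVE";
  if (err == CL_SUCCESS)
    err = clBuildProgram(prog, 1, &dev, opts, NULL, NULL);
  return (err == CL_SUCCESS) ? prog : NULL;
}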
+
+#if M0 == 2
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 3 // M0 == 3
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 4 // M0 == 4
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 5 // M0 == 5
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ DATA_TYPE res1 = a4.s##i; \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
+ })
+#elif M0 == 6 // M0 == 6
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(2) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 7 // M0 == 7
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(3) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 8 // M0 == 8
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, \
+ a6.s##i, a7.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#else // M0 not supported
+#error "M0 value not supported"
+#endif // M0 conditions
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks
+ * of size M0xK0 and stores each one (transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g.
+ * -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g.
+ * -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer
+ * 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) +
+ ((y / (uint)V0) * (uint)dst_stride_y) +
+ ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ // Load values from the LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+ // ---------------------------Transpose and store block -----------------------
+
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
+#if K0 > 2
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
+#endif // K0 > 2
+#if K0 > 3
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
+#endif // K0 > 3
+#if K0 > 4
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
+#endif // K0 > 4
+#if K0 > 8
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+
+#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks
+ * of size K0xN0 and stores each one (not transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g.
+ * -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g.
+ * -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 1,2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y +
+ z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) +
+ ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) +
+ ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+#if K0 > 1
+ if (y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#endif // K0 > 1
+#if K0 > 2
+ if (y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if (y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if (y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if (y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if (y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if (y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if (y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if (y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if (y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if (y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if (y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if (y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if (y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if (y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Store output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
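The RHS reshape kernel takes the analogous set of defines; an illustrative option string (example values, not taken from the patch):

/* Example -D options for gemm_reshape_rhs_matrix_nt (illustrative values). */
static const char *rhs_nt_build_opts =
  "-DDATA_TYPE=float -DSRC_HEIGHT=64 -DK0=4 -DN0=8 -DH0=2 -DINTERLEAVE";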
+
+#if defined(TRANSPOSE)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks
+ * of size K0xN0 and stores each one (transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g.
+ * -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g.
+ * -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y +
+ z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) +
+ ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) +
+ ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+ REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ if (y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#if K0 > 2
+ if (y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if (y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if (y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if (y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if (y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if (y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if (y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if (y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if (y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if (y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if (y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if (y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if (y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if (y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Transpose the block ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(
+ N0, VEC_DATA_TYPE(DATA_TYPE, K0), res,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
+
+#if K0 == 2
+ // This part computes the following transpositions:
+ // 2x2 -> 2x2
+ // 2x4 -> 4x2
+ // 2x8 -> 8x2
+ // 2x16 -> 16x2
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
+#endif // N0 > 8
+
+#elif K0 == 3 // K0 == 3
+ // This part computes the following transpositions:
+ // 3x2 -> 2x3
+ // 3x4 -> 4x3
+ // 3x8 -> 8x3
+ // 3x16 -> 16x3
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
+#endif // N0 > 8
+
+#elif K0 == 4 // K0 == 4
+ // This part computes the following transpositions:
+ // 4x2 -> 2x4
+ // 4x4 -> 4x4
+ // 4x8 -> 8x4
+ // 4x16 -> 16x4
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
+#endif // N0 > 8
+
+#elif K0 == 8 // K0 == 8
+ // This part computes the following transpositions:
+ // 8x2 -> 2x8
+ // 8x4 -> 4x8
+ // 8x8 -> 8x8
+ // 8x16 -> 16x8
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
+#endif // N0 > 8
+
+#elif K0 == 16 // K0 == 16
+
+ // This part computes the following transpositions:
+ // 16x2 -> 2x16
+ // 16x4 -> 4x16
+ // 16x8 -> 8x16
+ // 16x16 -> 16x16
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
+ a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
+ a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
+ a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
+ a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
+ a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
+ a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
+ a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
+ a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
+ a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
+ a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
+ a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
+ a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
+ a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
+ a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
+ a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
+ a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
+#endif // N0 > 8
+
+#else // K0 not supported
+#error "Not supported K0 value"
+#endif // K0 conditions
+
+ // ---------------------------Store the output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(TRANSPOSE)
+#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
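The transposed RHS variant additionally requires -DTRANSPOSE; again the values are purely illustrative:

/* Example -D options for gemm_reshape_rhs_matrix_t (illustrative values). */
static const char *rhs_t_build_opts =
  "-DDATA_TYPE=float -DSRC_HEIGHT=64 -DK0=4 -DN0=4 -DH0=4 -DTRANSPOSE";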
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && \
+ defined(M) && defined(N) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) ({ c = fma(a, b, c); })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c = fma((a.s2), (b.s2), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c = fma((a.s3), (b.s3), c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
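These macros build a dot product out of fused multiply-adds, accumulating into one scalar lane of the output; a plain C reference of the 4-lane case (a sketch over arrays rather than OpenCL vector types):

#include <math.h>

/* Scalar reference of ARM_DOT4: c += a . b, built from fused multiply-adds. */
static inline float arm_dot4_ref(const float a[4], const float b[4], float c)
{
  for (int i = 0; i < 4; ++i)
    c = fmaf(a[i], b[i], c);
  return c;
}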
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##8), (c.s8)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##9), (c.s9)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##A), (c.sA)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##B), (c.sB)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##C), (c.sC)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##D), (c.sD)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##E), (c.sE)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
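Putting the notes above together, a hedged example of the build options such a kernel could be compiled with; the sizes and block dimensions below are made up for illustration (mirroring the examples in the comment) and are not taken from this patch:

/* Example -D options for gemm_mm_reshaped_only_rhs_t (illustrative values). */
static const char *mm_rhs_t_build_opts =
  "-DDATA_TYPE=float -DM=52 -DN=30 -DK=90 "
  "-DM0=4 -DN0=8 -DK0=4 -DH0=2 -DRHS_INTERLEAVE "
  "-DALPHA=1.2f -DBETA=1.0f";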
+
+#define VFMA(a, b, c) ({ c = fma(a, b, c); })
+
+#if M0 == 1
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
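+ * @note Example (illustrative only, values are arbitrary) of a build-option set satisfying the
+ * notes above: "-DDATA_TYPE=float -DM=64 -DN=64 -DK=128 -DM0=4 -DN0=8 -DK0=4 -DH0=2"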
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function
+ * is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); // uint zin0=0,zin1=0,zin2=0,... zin7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0,zero1=0,zero2=0,... zero15=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+ // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+ LD_RHS_VFMA_M0xN0(1, a, c);
+#if K0 > 2
+ LD_RHS_VFMA_M0xN0(2, a, c);
+#endif // K0 > 2
+#if K0 > 3
+ LD_RHS_VFMA_M0xN0(3, a, c);
+#endif // K0 > 3
+#if K0 > 4
+ LD_RHS_VFMA_M0xN0(4, a, c);
+ LD_RHS_VFMA_M0xN0(5, a, c);
+ LD_RHS_VFMA_M0xN0(6, a, c);
+ LD_RHS_VFMA_M0xN0(7, a, c);
+#endif // K0 > 4
+#if K0 > 8
+ LD_RHS_VFMA_M0xN0(8, a, c);
+ LD_RHS_VFMA_M0xN0(9, a, c);
+ LD_RHS_VFMA_M0xN0(A, a, c);
+ LD_RHS_VFMA_M0xN0(B, a, c);
+ LD_RHS_VFMA_M0xN0(C, a, c);
+ LD_RHS_VFMA_M0xN0(D, a, c);
+ LD_RHS_VFMA_M0xN0(E, a, c);
+ LD_RHS_VFMA_M0xN0(F, a, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) &&
+ // defined(M) && defined(N) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && \
+ defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+
+#if defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ c += a.s8 * b.s8; \
+ c += a.s9 * b.s9; \
+ c += a.sA * b.sA; \
+ c += a.sB * b.sB; \
+ c += a.sC * b.sC; \
+ c += a.sD * b.sD; \
+ c += a.sE * b.sE; \
+ c += a.sF * b.sF; \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#else // defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ c = fma(a.s8, b.s8, c); \
+ c = fma(a.s9, b.s9, c); \
+ c = fma(a.sA, b.sA, c); \
+ c = fma(a.sB, b.sB, c); \
+ c = fma(a.sC, b.sC, c); \
+ c = fma(a.sD, b.sD, c); \
+ c = fma(a.sE, b.sE, c); \
+ c = fma(a.sF, b.sF, c); \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#endif // defined(MIXED_PRECISION)
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ ARM_DOT_K0((a), (b##8), (c.s8)); \
+ ARM_DOT_K0((a), (b##9), (c.s9)); \
+ ARM_DOT_K0((a), (b##A), (c.sA)); \
+ ARM_DOT_K0((a), (b##B), (c.sB)); \
+ ARM_DOT_K0((a), (b##C), (c.sC)); \
+ ARM_DOT_K0((a), (b##D), (c.sD)); \
+ ARM_DOT_K0((a), (b##E), (c.sE)); \
+ ARM_DOT_K0((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
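+
+// ARM_DOT_K0(a, b, c) accumulates into the scalar c the dot product of the two K0-element
+// vectors a and b. ARM_DOT_K0XN0(a, b, c) applies it to the N0 RHS vectors b0..b(N0-1), filling
+// one N0-wide row of accumulators c. Illustrative expansion (not part of the kernel) for
+// K0 == 2 and N0 == 2: c.s0 += a.s0 * b0.s0 + a.s1 * b0.s1; c.s1 += a.s0 * b1.s0 + a.s1 * b1.s1;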
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT
+ * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0
+ * must be transposed
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using
+ * -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION
+ * passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
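+ * @note Example (illustrative only, values are arbitrary) of a build-option set for this kernel:
+ * "-DDATA_TYPE=half -DDATA_TYPE_ACCUMULATOR=float -DMIXED_PRECISION -DM=64 -DN=64 -DM0=4 -DN0=8
+ * -DK0=4 -DV0=2 -DH0=2" (K is passed at run time through the k argument)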
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function
+ * is performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (get_global_id(1) / V0) * (uint)lhs_stride_y +
+ (get_global_id(2) * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+ for (int i = 0; i < k; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D,
+ dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(LHS_TRANSPOSE)
+
+#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
+
+#if defined(MIXED_PRECISION)
+
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) \
+ c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * \
+ (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) \
+ c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), \
+ (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#else // defined(MIXED_PRECISION)
+
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#endif // defined(MIXED_PRECISION)
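+
+// ARM_VFMA issues one multiply-accumulate per call: on Midgard GPUs it is expressed as
+// c += a * b, while on the other GPU architectures it uses the fma() builtin. In mixed-precision
+// mode both operands are first converted to DATA_TYPE_ACCUMULATOR.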
+
+#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) ({ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); })
+#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
+ })
+#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
+ })
+#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
+ })
+#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
+ })
+
+// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication
+// with K0 = 1: a is the column-vector (transposed), b is the row-vector (not transposed) and C is
+// the output matrix. Lower case denotes a vector (a, b), upper case a matrix (C).
+#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
+
+#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
+ ({ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); })
+#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
+ })
+
+// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
+// The dimensions for this matrix multiplications are defined through M0, N0 and K0
+// The dimensions supported are:
+// M0: 1, 2, 3, 4, 8
+// N0: 1, 2, 3, 4, 8, 16
+// K0: 1, 2, 3, 4, 8, 16
+// This macro calls the vector-by-matrix macro K0 times
+// A, B and C are matrices
+#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
+ CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
+ (M0, N0, TYPE, A, B, C)
+
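+// Illustrative example (not part of the kernel): ARM_MM_T_NT(4, 8, 1, float, a, b, c) expands to
+// ARM_MM_T_NT_M0xN0x1(4, 8, float, a, b, c), i.e. a single rank-1 update of the 4x8 accumulator
+// block c with the column-vector a0 (4 LHS values of one K step) and the row-vector b0 (8 RHS
+// values of the same K step).
+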
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be
+ * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0
+ * must be NOT transposed
+ *
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g.
+ * -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
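+ * @note Example (illustrative only, values are arbitrary) of a build-option set for this kernel:
+ * "-DLHS_TRANSPOSE -DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float -DM=64 -DN=64 -DM0=4 -DN0=4
+ * -DK0=4 -DV0=2 -DH0=2" (K is passed at run time through the k argument)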
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function
+ * is performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
+
+ const uint x = get_global_id(0);
+ const uint y = get_global_id(1);
+ const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+ __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+ __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
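+ // Each loop iteration below consumes K0 accumulation steps: every step loads one column of the
+ // transposed LHS block (M0 values) and one row of the RHS block (N0 values) and performs a
+ // rank-1 update of the M0xN0 accumulator block through ARM_MM_T_NT.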
+ for (int i = 0; i < k; i += K0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, M0)
+ a0 = VLOAD(M0)(0, lhs);
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+#if K0 > 1
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+ lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+ rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr =
+ bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) +
+ z * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#endif // defined(LHS_TRANSPOSE)
+
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) &&
+ // defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#define VFMA(a, b, c) ({ c = fma(a, b, c); })
+
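+// RHS_VFMA_M0xN0(i, a, b, c) takes an already-loaded RHS row vector b (the i-th K element of the
+// RHS block, N0 values wide) and accumulates it, scaled by the i-th element of each of the M0 LHS
+// rows, into the accumulators c0..c(M0-1). Unlike LD_RHS_VFMA_M0xN0 above, the RHS values are
+// passed in rather than loaded inside the macro.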
+#if M0 == 1
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); })
+#elif M0 == 2 // M0 == 2
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is NOT reshaped
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g.,
+ * -DK0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
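+ * @note Example (illustrative only, values are arbitrary) of a build-option set for this kernel:
+ * "-DDATA_TYPE=float -DM=64 -DN=64 -DK=128 -DM0=4 -DN0=4 -DK0=4"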
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function
+ * is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x lhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y lhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type:
+ * same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x rhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y rhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
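+  // zlhs0..zlhs(M0-1) hold per-row Z offsets for the LHS loads (updated below when
+  // REINTERPRET_INPUT_AS_3D is defined); zero0..zero15 stay 0 and are used for the RHS and bias loads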
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
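+    // LOAD_BLOCK(M0, K0, ...) declares a0..a(M0-1), each a K0-wide vector of LHS values; the RHS
+    // block is loaded as b0..b(K0-1), each N0 values wide, consumed by the RHS_VFMA_M0xN0 calls below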
+
+ RHS_VFMA_M0xN0(0, a, b0, c);
+ RHS_VFMA_M0xN0(1, a, b1, c);
+#if K0 > 2
+ RHS_VFMA_M0xN0(2, a, b2, c);
+#endif // K0 > 2
+#if K0 > 3
+ RHS_VFMA_M0xN0(3, a, b3, c);
+#endif // K0 > 3
+#if K0 > 4
+ RHS_VFMA_M0xN0(4, a, b4, c);
+ RHS_VFMA_M0xN0(5, a, b5, c);
+ RHS_VFMA_M0xN0(6, a, b6, c);
+ RHS_VFMA_M0xN0(7, a, b7, c);
+#endif // K0 > 4
+#if K0 > 8
+ RHS_VFMA_M0xN0(8, a, b8, c);
+ RHS_VFMA_M0xN0(9, a, b9, c);
+ RHS_VFMA_M0xN0(A, a, bA, c);
+ RHS_VFMA_M0xN0(B, a, bB, c);
+ RHS_VFMA_M0xN0(C, a, bC, c);
+ RHS_VFMA_M0xN0(D, a, bD, c);
+ RHS_VFMA_M0xN0(E, a, bE, c);
+ RHS_VFMA_M0xN0(F, a, bF, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * rhs_stride_y;
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
+ RHS_VFMA_M0xN0(0, a, b, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += rhs_stride_y;
+ }
+
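+  // Compute the destination address of the top-left element of this work-item's M0 x N0 output block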
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+/** This OpenCL kernel is optimized for Midgard. It computes the matrix multiplication between
+ * matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in]  src2_ptr                          (Optional) Pointer to the bias matrix. Supported
+ *                                              data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float4 c0 = 0.0f;
+ float4 c1 = 0.0f;
+ float4 c2 = 0.0f;
+ float4 c3 = 0.0f;
+
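+  // The unrolled loop consumes two 4-element blocks of the transposed B per iteration; the tail
+  // loop below it handles one remaining 4-element block at a time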
+ for (; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
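+  // e.g. with HEIGHT_GEMM3D=4 and get_global_id(1)=1, this tile covers output rows 4..7, so
+  // zout = (4,5,6,7) / 4 = (1,1,1,1): all four rows fall on plane 1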
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x4 block
+ vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between
+ * matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in]  src2_ptr                          (Optional) Pointer to the bias matrix. Supported
+ *                                              data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float4 c0 = 0.0f;
+ float4 c1 = 0.0f;
+ float4 c2 = 0.0f;
+ float4 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
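+  // Each step of the K loop below consumes 4 * MULT_TRANSPOSE1XW_WIDTH elements of the reshaped
+  // B row, so COLS_MTX_B steps cover all COLS_B elements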
+
+ int i = 0;
+ for (; i <= (int)(COLS_MTX_B - 4); i += 4)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+ }
+
+ for (; i < (int)(COLS_MTX_B); ++i)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x4 block
+ vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and
+ * matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in]  src2_ptr                          (Optional) Pointer to the bias matrix. Supported
+ *                                              data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
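+  // For F16 the transposed B is traversed in blocks of 8 halves (vload8) instead of 4 floats,
+  // hence the factor 8 in offset_row_b and in the pointer increments below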
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ half8 c0 = 0.0f;
+ half8 c1 = 0.0f;
+ half8 c2 = 0.0f;
+ half8 c3 = 0.0f;
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and
+ * matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in]  src2_ptr                          (Optional) Pointer to the bias matrix. Supported
+ *                                              data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float8 c0 = 0.0f;
+ float8 c1 = 0.0f;
+ float8 c2 = 0.0f;
+ float8 c3 = 0.0f;
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+ b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
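+  // Add beta*bias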
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+ float8 bias_f1 = convert_float8(bias1);
+ float8 bias_f2 = convert_float8(bias2);
+ float8 bias_f3 = convert_float8(bias3);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
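+  // Accumulation was performed in F32; convert back to F16 before the activation and the final store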
+ half8 c_h0 = convert_half8(c0);
+ half8 c_h1 = convert_half8(c1);
+ half8 c_h2 = convert_half8(c2);
+ half8 c_h3 = convert_half8(c3);
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel, optimized for Bifrost architectures, computes the matrix multiplication
+ * between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in]  src2_ptr                          (Optional) Pointer to the bias matrix. Supported
+ *                                              data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ half8 c0 = 0.0f;
+ half8 c1 = 0.0f;
+ half8 c2 = 0.0f;
+ half8 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
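+ // Illustrative example (hypothetical values, not defaults): with -DCOLS_B=256 and
+ // -DMULT_TRANSPOSE1XW_WIDTH=2, COLS_MTX_B evaluates to 256 / 16 = 16, so the unrolled loop
+ // below runs four iterations and the leftover loop after it runs none.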
+
+ int i = 0;
+ for (; i <= (int)(COLS_MTX_B - 4); i += 4)
+ {
+#if MULT_INTERLEAVE4X4_HEIGHT == 1
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half8 a0 = vload8(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s4, b0, c0);
+ c1 = fma((half8)a0.s5, b0, c1);
+ c2 = fma((half8)a0.s6, b0, c2);
+ c3 = fma((half8)a0.s7, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload8(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s4, b0, c0);
+ c1 = fma((half8)a0.s5, b0, c1);
+ c2 = fma((half8)a0.s6, b0, c2);
+ c3 = fma((half8)a0.s7, b0, c3);
+#else // MULT_INTERLEAVE4X4_HEIGHT == 1
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
+ }
+
+ for (; i < (int)(COLS_MTX_B); ++i)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // |                  |
+ // |      plane0      |
+ // |                  |
+ // |__________________|
+ // |******************|
+ // |  cross_plane_pad |
+ // |******************|
+ // |                  |
+ // |      plane1      |
+ // |                  |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
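+ // Worked example (illustrative values only): with HEIGHT_GEMM3D=6, DEPTH_GEMM3D=2 and
+ // get_global_id(1)=1, this tile covers output rows 4..7, so zout=(0,0,1,1) before the multiply;
+ // after it, the two rows that fall in plane1 carry an extra offset of
+ // cross_plane_pad * dst_stride_y bytes, which skips the padding rows between the two planes.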
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
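+ // At this point each row of c holds the (optionally alpha-scaled) A * B product, plus
+ // beta * bias when BETA is defined; the optional activation below is applied to this sum
+ // before the 4x8 tile is stored.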
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
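+// A minimal, hypothetical set of build options satisfying the compile-time requirements of
+// gemm_mm_interleaved_transposed_f16_bifrost above (values are illustrative, not defaults):
+//   -DARM_COMPUTE_OPENCL_FP16_ENABLED -DCOLS_B=256 -DMULT_TRANSPOSE1XW_WIDTH=2
+//   -DMULT_INTERLEAVE4X4_HEIGHT=2 -DALPHA=0.5f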
+
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+
+#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && \
+  defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#if defined(DATA_TYPE)
+#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
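+// For example (illustrative values): with -DDATA_TYPE=float and
+// -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4, VECTOR_TYPE expands to float4, i.e. each work item
+// computes four output columns per processed row.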
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped.
+ *
+ * @note This OpenCL kernel works with floating point data types (F16/F32)
+ * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g.
+ * -DDATA_TYPE=float)
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The number of matrix A columns and the optional alpha's value need to be passed at compile
+ * time using -DCOLS_A and -DALPHA
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and there
+ * are multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must also be
+ * passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * applied after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16/F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(DATA_TYPE);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // |                  |
+ // |      plane0      |
+ // |                  |
+ // |__________________|
+ // |******************|
+ // |  cross_plane_pad |
+ // |******************|
+ // |                  |
+ // |      plane1      |
+ // |                  |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
+
+ VECTOR_TYPE acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VECTOR_TYPE acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VECTOR_TYPE acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VECTOR_TYPE acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
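+ // Main loop: each iteration consumes two columns of matrix A (one vload2 per processed row)
+ // and the two corresponding rows of matrix B, so both addresses advance together; the scalar
+ // loop below handles any leftover column when COLS_A is odd.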
+ for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE));
+ src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0,
+ src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ VECTOR_TYPE b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+ VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(
+ 0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0.s0;
+ acc0 += b1 * (VECTOR_TYPE)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1.s0;
+ acc1 += b1 * (VECTOR_TYPE)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2.s0;
+ acc2 += b1 * (VECTOR_TYPE)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3.s0;
+ acc3 += b1 * (VECTOR_TYPE)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ VECTOR_TYPE b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // |                  |
+ // |      plane0      |
+ // |                  |
+ // |__________________|
+ // |******************|
+ // |  cross_plane_pad |
+ // |******************|
+ // |                  |
+ // |      plane1      |
+ // |                  |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y,
+ zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias,
+ src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc,
+ dst_addr, dst_stride_y, zout.s);
+}
+#endif // defined(DATA_TYPE)
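+// A minimal, hypothetical set of build options for gemm_mm_floating_point above (values are
+// illustrative only):
+//   -DDATA_TYPE=float -DCOLS_A=128 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4
+//   -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4 -DALPHA=1.2f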
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma
+ * units.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and there
+ * are multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must also be
+ * passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * applied after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for matrix B
+ src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // |                  |
+ // |      plane0      |
+ // |                  |
+ // |__________________|
+ // |******************|
+ // |  cross_plane_pad |
+ // |******************|
+ // |                  |
+ // |      plane1      |
+ // |                  |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize accumulators
+ float4 acc0 = 0.0f;
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float4 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float4 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float4 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
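+ // Main unrolled loop: four columns of matrix A per iteration; each column is combined with a
+ // freshly loaded float4 row of matrix B through fma, i.e. 16 fma per accumulator row per
+ // iteration. The scalar loop further below handles the remaining COLS_A % 4 columns.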
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(float);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0, b0.s3, acc0.s3);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1.s0 = fma(a1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1, b0.s3, acc1.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2.s0 = fma(a2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2, b0.s3, acc2.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3.s0 = fma(a3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // |                  |
+ // |      plane0      |
+ // |                  |
+ // |__________________|
+ // |******************|
+ // |  cross_plane_pad |
+ // |******************|
+ // |                  |
+ // |      plane1      |
+ // |                  |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma
+ * units. This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less
+ * than or equal to 1000.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if
+ * alpha!=1.0f.
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and there
+ * are multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must also be
+ * passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * applied after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Requires NUM_ELEMS_PROCESSED_PER_THREAD_X == 2 (C as a vect2, A as a vect4, B as 2 vload2).
+ // TODO: fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // |                  |
+ // |      plane0      |
+ // |                  |
+ // |__________________|
+ // |******************|
+ // |  cross_plane_pad |
+ // |******************|
+ // |                  |
+ // |      plane1      |
+ // |                  |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize accumulators
+ float2 acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float2 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float2 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float2 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
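+ // Main unrolled loop: eight columns of matrix A per iteration (one vload8 per processed row)
+ // against eight two-element rows of matrix B, producing only two output columns per work item;
+ // the scalar loop further below handles the remaining COLS_A % 8 columns.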
+ for (; i <= ((int)COLS_A - 8); i += 8)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+ acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
+ acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
+ acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
+ acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
+ acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
+ acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
+ acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
+
+ acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+ acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
+ acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
+ acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
+ acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
+ acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
+ acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
+ acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
+ acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
+ acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
+ acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
+ acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
+ acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
+ acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
+ acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
+
+ acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
+ acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
+ acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
+ acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
+ acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
+ acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
+ acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
+ acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
+ acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
+ acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
+ acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
+ acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
+ acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
+ acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
+ acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
+
+ acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
+ acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
+ acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
+ acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
+ acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
+ acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
+ acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
+ acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
+ acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
+ acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
+ acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
+ acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
+ acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
+ acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
+ acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
+
+ acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
+ acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
+ acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
+ acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
+ acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
+ acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
+ acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
+ acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float) * 8;
+ }
+  // Left-over columns: process the remaining columns of matrix A one float at a time
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0, b0.s1, acc0.s1);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1.s0 = fma(a1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1, b0.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2.s0 = fma(a2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2, b0.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3.s0 = fma(a3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3, b0.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
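+  // Illustrative example (values assumed only for clarity): with
+  // NUM_ELEMS_PROCESSED_PER_THREAD_Y = 4, HEIGHT_GEMM3D = 8 and get_global_id(1) = 3, this
+  // work-item handles rows 12..15, so the division gives zout = (12, 13, 14, 15) / 8 = (1, 1, 1, 1),
+  // i.e. all four rows fall in plane 1, before the clamp against DEPTH_GEMM3D - 1.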
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
+
+ LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
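+// The compile-time definitions used by the kernels in this file (COLS_A,
+// NUM_ELEMS_PROCESSED_PER_THREAD_X, NUM_ELEMS_PROCESSED_PER_THREAD_Y and the optional ALPHA,
+// BETA, ACTIVATION_TYPE, ...) are passed by the host code as "-D" build options when the program
+// is compiled. As an illustration of the mechanism only, a build might pass
+//   -DCOLS_A=64 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4
+// where the exact values depend on the kernel variant and the tensor shapes.
+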
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates
+ * the result in a 32-bit floating point variable.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed
+ * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  float8 acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+  float8 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+  float8 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+  float8 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (float8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+ acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float8 bias_f1 = convert_float8(bias1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float8 bias_f2 = convert_float8(bias2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float8 bias_f3 = convert_float8(bias3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ half8 acc_h0 = convert_half8(acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half8 acc_h1 = convert_half8(acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half8 acc_h2 = convert_half8(acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half8 acc_h3 = convert_half8(acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
+}
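+
+// In short, for each output row r assigned to this work-item the kernel above computes
+// acc_r[x] = sum_k A[r][k] * B[k][x] over 8 consecutive output columns x, keeping the running
+// sum in fp32 and converting it to half only when the block is stored.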
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma
+ * units.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed
+ * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ half8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) &&
+       // defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+
+#if defined(BETA)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account
+ * that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The value of beta needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types:
+ * F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+ // Load values from Matrix C
+ float4 c = vload4(0, (__global float *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ float4 out = alpha_ab + (float4)BETA * c;
+
+ // Store final result in axb matrix
+ vstore4(out, 0, (__global float *)dst.ptr);
+}
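+
+// Equivalently, with dst already holding alpha * (A x B) on entry, the kernel above performs the
+// in-place update dst <- alpha * (A x B) + BETA * C, four floats per work-item.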
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account
+ * that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The value of beta needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types:
+ * F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+ // Load values from Matrix C
+ half8 c = vload8(0, (__global half *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ half8 out = alpha_ab + (half8)BETA * c;
+
+ // Store final result in axb matrix
+ vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(BETA)
+
+#if defined(WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and
+ * matrix B (src1), as used by the locally connected layer
+ *
+ * @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @note The input A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0), TENSOR3D_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+ int idy = get_global_id(1);
+
+ // Compute the address for the vector A and matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy,
+ src1_offset_first_element_in_bytes + src1_stride_z * idy));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float));
+ src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+#endif // defined(WIDTH_VECTOR_A)
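+
+// For reference, in gemm_lc_vm_f32 each work-item with coordinates (idx = 4 * get_global_id(0),
+// idy = get_global_id(1)) computes dst[idy][idx..idx+3] = sum_k A[idy][k] * B[idy][k][idx..idx+3],
+// i.e. every output row uses its own slice of B, as required by the locally connected layer.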
+
+/** This kernel accumulates each row with the biases vector.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported
+ * data type: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]      accum_stride_x                      Stride of the accumulate tensor in X
+ * dimension (in bytes)
+ * @param[in] accum_step_x accum_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                      Stride of the accumulate tensor in Y
+ * dimension (in bytes)
+ * @param[in]      accum_step_y                        accum_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the
+ * accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p
+ * accum_ptr
+ * @param[in]      biases_stride_x                     Stride of the biases vector in X
+ * dimension (in bytes)
+ * @param[in]      biases_step_x                       biases_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the
+ * biases vector
+ */
+#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
+__kernel void gemm_accumulate_biases(IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases))
+{
+ Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+  // VECTOR_SIZE is the number of elements processed per work-item.
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
+ accum_value = biases_value + accum_value;
+ // Store result in the accumulate buffer
+ VSTORE(VECTOR_SIZE)
+ (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)
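+
+// As an illustration only (the actual values come from the host code and the tensor data type),
+// gemm_accumulate_biases could be built with e.g. "-DDATA_TYPE=float -DVECTOR_SIZE=16", in which
+// case each work-item adds 16 floats of the bias vector to the corresponding row chunk of the
+// accumulator.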
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h
new file mode 100644
index 000000000..0c75d061f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h
@@ -0,0 +1,1235 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ROW_n
+ *
+ * @param[in] N0        The number of columns to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+
+/** @}*/ // end of group LOAD_ROW_n
+
+/** Load Blocks (consecutive rows and columns) with Z offset.
+ * @name LOAD_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_BLOCK
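+
+/* Illustrative expansion (editor's sketch, not part of the original header; pointer and offset
+ * names such as lhs_ptr, lhs_stride_y and zlhs are placeholders): with M0=2, N0=4 and
+ * DATA_TYPE=uchar,
+ *
+ *   LOAD_BLOCK(2, 4, uchar, a, lhs_ptr, 0, lhs_stride_y, zlhs);
+ *
+ * should expand to roughly
+ *
+ *   uchar4 a0 = vload4(0, (__global uchar *)(lhs_ptr + 0 + 0 * lhs_stride_y + zlhs0));
+ *   uchar4 a1 = vload4(0, (__global uchar *)(lhs_ptr + 0 + 1 * lhs_stride_y + zlhs1));
+ *
+ * where zlhs0 and zlhs1 are assumed to have been produced by CALCULATE_Z_OFFSET below.
+ */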
+
+/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ELEMENT_n
+ *
+ * @param[in] N0 The width of the destination vectors
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @{
+ */
+#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
+
+#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
+
+#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
+
+#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
+
+#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
+
+#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
+
+#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
+
+#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
+
+#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
+
+#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
+
+#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
+
+#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
+
+#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
+
+#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
+
+#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
+
+#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
+
+/** @}*/ // end of group LOAD_ELEMENT_n
+
+/** Load Scalar as Vector (consecutive elements).
+ * @name LOAD_SCALAR_AS_VECTOR
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @{
+ */
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
+
+/** Basic macros to calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET_n
+ *
+ * @param[in] M0 The number of offset values to calculate
+ * @param[in] DATA_TYPE The data type of the results
+ * @param[in] Z The basename of the result variables
+ * @param[in] Y The work-item ID of the y-axis
+ * @param[in] HEIGHT_GEMM3D The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ *
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \
+ Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \
+ Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \
+ Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \
+ Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \
+ Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \
+ Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \
+ Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \
+ Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+/** @} */ // end of group CALCULATE_Z_OFFSET_n
+
+/** Calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET
+ *
+ * The Z offsets are expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected names of the Z offsets are zin0, zin1 and zin2.
+ * Note that CROSS_PLANE_PAD (cross-plane padding) is required to take into account
+ * the possible cross-plane padding when the plane changes across the z-dimension.
+ *
+ * <!--
+ * | |
+ * | plane0 |
+ * | |
+ * |__________________|
+ * |******************|
+ * | cross_plane_pad |
+ * |******************|
+ * | |
+ * | plane1 |
+ * | |
+ * |__________________|
+ * -->
+ *
+ * @param[in] M0 The number of offset values to calculate
+ * @param[in] DATA_TYPE The data type of the results
+ * @param[in] Z The basename of the result variables
+ * @param[in] Y The work-item ID of the y-axis
+ * @param[in] HEIGHT_GEMM3D The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y)
+/** @} */ // end of group CALCULATE_Z_OFFSET
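+
+/* Illustrative expansion (editor's sketch; variable names such as zin, y, PAD and src_stride_y are
+ * placeholders): with M0=2 and DATA_TYPE=uint,
+ *
+ *   CALCULATE_Z_OFFSET(2, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, PAD, src_stride_y);
+ *
+ * computes
+ *
+ *   zin0 = (0 + (uint)(y * (uint)2)) / (uint)HEIGHT_GEMM3D;
+ *   zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ *   zin0 *= (PAD * src_stride_y);
+ *
+ * and the analogous statements for zin1, i.e. each row receives the extra byte offset needed to
+ * skip the cross-plane padding.
+ */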
+
+/** Store the 0th to (n-1)th rows of the given variables
+ * @name STORE_ROW_n
+ *
+ * @param[in] N0 The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group STORE_ROW_n
+
+/** Convert and store the 0th to (n-1)th rows of the given variables
+ * @name CONVERT_STORE_ROW_n
+ *
+ * @param[in] N0 The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+/** @} */ // end of group CONVERT_STORE_ROW_n
+
+/** Store a block of the given size M0xN0
+ * @name STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group STORE_BLOCK
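+
+/* Illustrative expansion (editor's sketch; dst_ptr, dst_stride_y and zout are placeholder names):
+ * with M0=2, N0=4 and DATA_TYPE=int,
+ *
+ *   STORE_BLOCK(2, 4, int, c, dst_ptr, dst_stride_y, zout);
+ *
+ * should expand to roughly
+ *
+ *   vstore4(c0, 0, (__global int *)(dst_ptr + 0 * dst_stride_y + zout0));
+ *   vstore4(c1, 0, (__global int *)(dst_ptr + 1 * dst_stride_y + zout1));
+ */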
+
+/** Convert and store a block of the given size M0xN0
+ * @name CONVERT_STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group CONVERT_STORE_BLOCK
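+
+/* Illustrative note (editor's sketch): CONVERT_STORE_BLOCK behaves like STORE_BLOCK above, except
+ * that each row is first saturated-converted to DATA_TYPE, e.g. for N0=4 and DATA_TYPE=uchar the
+ * row c0 is assumed to be stored as convert_uchar4_sat(c0) via the common CONVERT_SAT helper.
+ */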
+
+/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
+ * @name SCALE_ROW_n
+ *
+ * @param[in] DATA_TYPE The data type of the variables
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] SCALE The scale factor
+ * @{
+ */
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##1 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##2 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##3 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##4 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##5 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##6 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##7 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##8 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##9 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##A *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##B *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##C *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##D *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##E *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##F *= (DATA_TYPE)SCALE;
+/** @} */ // end of group SCALE_ROW_n
+
+/** Scale elements stored in a block (BASENAME)
+ * @name SCALE_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N The number of rows in the block
+ * @param[in] DATA_TYPE The data type of the block
+ * @param[in] BASENAME The basename of the block
+ * @param[in] SCALE The scale factor
+ * @{
+ */
+#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+/** @} */ // end of group SCALE_BLOCK
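+
+/* Illustrative expansion (editor's sketch): SCALE_BLOCK(3, float, c, 0.5f) should expand to
+ *
+ *   c0 *= (float)0.5f;
+ *   c1 *= (float)0.5f;
+ *   c2 *= (float)0.5f;
+ */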
+
+/** Create a new vector containing the values at the given index for a set of given vectors
+ * @name COLUMN_VECTORn
+ *
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
+ TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 2) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
+#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 3) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
+#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 4) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \
+ (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \
+ (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \
+ (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \
+ (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \
+ (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, \
+ (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, \
+ (X##F).s##IDX_COL);
+/** @} */ // end of group COLUMN_VECTORn
+
+/** Create a new vector containing the values at the given index. Utility macros for transposing a
+ * column-vector
+ * @name COLUMN_VECTOR_SCALARn
+ *
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 2) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
+#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 3) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
+#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 4) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
+#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+ (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+/** @} */ // end of group COLUMN_VECTOR_SCALARn
+
+/** Create transposed vectors of the given vectors
+ * @name TRANSPOSE_K0Xn
+ *
+ * @param[in] K0 The size of the source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B The basename of source vectors for transposition
+ * @param[in] TYPE The data type of the transposed vectors
+ * @{
+ */
+#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
+ COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X2(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X3(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X4(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X8(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, A, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, B, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, C, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, D, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, E, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, F, BASENAME, B, TYPE);
+
+/** @} */ // end of group TRANSPOSE_K0Xn
+
+/** Create column vectors to contain the values at the given index for a set of given vectors
+ *
+ * @param[in] K0 The number of source vectors
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ */
+#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
+ CONCAT(COLUMN_VECTOR, K0) \
+ (IDX_COL, BASENAME, B, TYPE);
+
+/** Create column vectors to contain the values at the given index. Utility macro for transposing a
+ * column-vector
+ *
+ * @param[in] K0 The number of source vectors
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ */
+#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \
+ CONCAT(COLUMN_VECTOR_SCALAR, K0) \
+ (IDX_COL, BASENAME, B, TYPE);
+
+/** Create transposed vectors from the given source vectors
+ *
+ * @param[in] K0 The size of source vectors
+ * @param[in] N0 The number of source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B The basename of source vectors for transposition
+ * @param[in] TYPE The data type of the transposed vectors
+ *
+ */
+#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
+ CONCAT(TRANSPOSE_K0X, N0) \
+ (K0, BASENAME, B, TYPE);
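+
+/* Illustrative expansion (editor's sketch; b and b_t are placeholder names): with K0=4, N0=2 and
+ * TYPE=uchar,
+ *
+ *   TRANSPOSE_K0XN0(4, 2, b_t, b, uchar);
+ *
+ * should produce the two column vectors
+ *
+ *   uchar4 b_t0 = (uchar4)((b0).s0, (b1).s0, (b2).s0, (b3).s0);
+ *   uchar4 b_t1 = (uchar4)((b0).s1, (b1).s1, (b2).s1, (b3).s1);
+ *
+ * i.e. the transpose of the K0-row by N0-column block held in b0..b3.
+ */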
+
+/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
+ * @name ADD_ROW_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The basename of the added variables
+ * @{
+ */
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
+
+#define ADD_ROW_2(BASENAME, BIAS) \
+ ADD_ROW_1(BASENAME, BIAS) \
+ BASENAME##1 += BIAS##1;
+
+#define ADD_ROW_3(BASENAME, BIAS) \
+ ADD_ROW_2(BASENAME, BIAS) \
+ BASENAME##2 += BIAS##2;
+
+#define ADD_ROW_4(BASENAME, BIAS) \
+ ADD_ROW_3(BASENAME, BIAS) \
+ BASENAME##3 += BIAS##3;
+
+#define ADD_ROW_5(BASENAME, BIAS) \
+ ADD_ROW_4(BASENAME, BIAS) \
+ BASENAME##4 += BIAS##4;
+
+#define ADD_ROW_6(BASENAME, BIAS) \
+ ADD_ROW_5(BASENAME, BIAS) \
+ BASENAME##5 += BIAS##5;
+
+#define ADD_ROW_7(BASENAME, BIAS) \
+ ADD_ROW_6(BASENAME, BIAS) \
+ BASENAME##6 += BIAS##6;
+
+#define ADD_ROW_8(BASENAME, BIAS) \
+ ADD_ROW_7(BASENAME, BIAS) \
+ BASENAME##7 += BIAS##7;
+
+#define ADD_ROW_9(BASENAME, BIAS) \
+ ADD_ROW_8(BASENAME, BIAS) \
+ BASENAME##8 += BIAS##8;
+
+#define ADD_ROW_10(BASENAME, BIAS) \
+ ADD_ROW_9(BASENAME, BIAS) \
+ BASENAME##9 += BIAS##9;
+
+#define ADD_ROW_11(BASENAME, BIAS) \
+ ADD_ROW_10(BASENAME, BIAS) \
+ BASENAME##A += BIAS##A;
+
+#define ADD_ROW_12(BASENAME, BIAS) \
+ ADD_ROW_11(BASENAME, BIAS) \
+ BASENAME##B += BIAS##B;
+
+#define ADD_ROW_13(BASENAME, BIAS) \
+ ADD_ROW_12(BASENAME, BIAS) \
+ BASENAME##C += BIAS##C;
+
+#define ADD_ROW_14(BASENAME, BIAS) \
+ ADD_ROW_13(BASENAME, BIAS) \
+ BASENAME##D += BIAS##D;
+
+#define ADD_ROW_15(BASENAME, BIAS) \
+ ADD_ROW_14(BASENAME, BIAS) \
+ BASENAME##E += BIAS##E;
+
+#define ADD_ROW_16(BASENAME, BIAS) \
+ ADD_ROW_15(BASENAME, BIAS) \
+ BASENAME##F += BIAS##F;
+
+/** @} */ // end of group ADD_ROW_n
+
+/** Add the block (BIAS) to another block (BASENAME)
+ * @name ADD_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The basename of the added variables
+ * @{
+ */
+#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK
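+
+/* Illustrative expansion (editor's sketch): ADD_BLOCK(2, c, bias) should expand to
+ *
+ *   c0 += bias0;
+ *   c1 += bias1;
+ */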
+
+/** Broadcast (add a single value) to each element of the destination variables
+ * @name ADD_ROW_BROADCAST_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The variable containing the value to add
+ * @{
+ */
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
+
+#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
+ BASENAME##1 += BIAS;
+
+#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+ BASENAME##2 += BIAS;
+
+#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+ BASENAME##3 += BIAS;
+
+#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+ BASENAME##4 += BIAS;
+
+#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+ BASENAME##5 += BIAS;
+
+#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+ BASENAME##6 += BIAS;
+
+#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+ BASENAME##7 += BIAS;
+
+#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+ BASENAME##8 += BIAS;
+
+#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+ BASENAME##9 += BIAS;
+
+#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+ BASENAME##A += BIAS;
+
+#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+ BASENAME##B += BIAS;
+
+#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+ BASENAME##C += BIAS;
+
+#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+ BASENAME##D += BIAS;
+
+#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+ BASENAME##E += BIAS;
+
+#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+ BASENAME##F += BIAS;
+/** @} */ // end of group ADD_ROW_BROADCAST_n
+
+/** Broadcast (add a value) to each element of the destination block (BASENAME)
+ * @name ADD_BLOCK_BROADCAST
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The variable containing the value to add
+ * @{
+ */
+#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK_BROADCAST
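+
+/* Illustrative expansion (editor's sketch): ADD_BLOCK_BROADCAST(2, c, bias0) should expand to
+ *
+ *   c0 += bias0;
+ *   c1 += bias0;
+ *
+ * i.e. the same (possibly vector) value is added to every row of the block.
+ */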
+
+/** Apply activation to the given variables
+ * @name ACTIVATION_ROW_n
+ *
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] A_VAL Additional value required by the activation
+ * @param[in] B_VAL Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
+/** @} */ // end of group ACTIVATION_ROW_n
+
+/** Apply activation to a block (BASENAME)
+ * @name ACTIVATION_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] A_VAL Additional value required by the activation
+ * @param[in] B_VAL Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+/** @} */ // end of group ACTIVATION_BLOCK
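+
+/* Illustrative expansion (editor's sketch): ACTIVATION_BLOCK(2, RELU, float, c, A_VAL, B_VAL)
+ * should expand to
+ *
+ *   c0 = ACTIVATION(RELU, float, c0, A_VAL, B_VAL);
+ *   c1 = ACTIVATION(RELU, float, c1, A_VAL, B_VAL);
+ *
+ * where the ACTIVATION macro itself is assumed to be provided by the common activation helpers.
+ */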
+
+/** Apply convert_<data_type> to the given variables
+ * @name CONVERT_ROW_n
+ *
+ * @param[in] N The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
+ */
+#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
+/** @} */ // end of group CONVERT_ROW_n
+
+/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
+ * @name CONVERT_BLOCK
+ *
+ * Supported cases are M=1,2,3,...,16.
+ *
+ * @param[in] M The number of vectors to convert
+ * @param[in] N The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
+ */
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
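+
+/* Illustrative expansion (editor's sketch): CONVERT_BLOCK(2, 4, half, c, c_h) should expand to
+ *
+ *   half4 c_h0 = convert_half4(c0);
+ *   half4 c_h1 = convert_half4(c1);
+ */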
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl
new file mode 100644
index 000000000..c19766c9f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl
@@ -0,0 +1,2733 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers_asymm.h"
+#include "repeat.h"
+
+#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+ defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16].
+ * These macros use the dot8 instruction */
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
+ })
+#define ARM_DOT4(a, b, c) ({ ARM_DOT(a, b, c); })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16]
+ * without using the dot8 instruction. */
+#define ARM_DOT1(a, b, c) ({ c += (ACC_DATA_TYPE)a * b; })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c += (ACC_DATA_TYPE)a.s0 * b.s0; \
+ c += (ACC_DATA_TYPE)a.s1 * b.s1; \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s2 * b.s2; \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s3 * b.s3; \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
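+
+/* Illustrative expansion (editor's sketch): without the dot8 extension, ARM_DOT4(a, b, c) with
+ * DATA_TYPE=uchar and ACC_DATA_TYPE=uint accumulates
+ *
+ *   c += (uint)a.s0 * b.s0 + (uint)a.s1 * b.s1 + (uint)a.s2 * b.s2 + (uint)a.s3 * b.s3;
+ *
+ * (as four separate += statements), while with cl_arm_integer_dot_product_int8 available the same
+ * macro maps to a single arm_dot()/arm_dot_acc() call.
+ */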
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0
+ * vectors "b" of size K0 [1,16] */
+#define ARM_DOT_K0X1(k0, a, b, c) ({ ARM_DOT_K0(k0, (a), (b##0), (c)); })
+#define ARM_DOT_K0X2(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
+ ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
+ })
+#define ARM_DOT_K0X3(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X2(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
+ })
+#define ARM_DOT_K0X4(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X3(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
+ })
+#define ARM_DOT_K0X8(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X4(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
+ ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
+ ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
+ ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
+ })
+#define ARM_DOT_K0X16(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X8(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
+ ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
+ ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
+ ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
+ ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
+ ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
+ ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
+ ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
+ })
+
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_K0XN0X1(n0, k0, a, b, c) ({ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); })
+#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X1(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X2(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X3(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X4(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X5(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X6(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X7(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
+ })
+
+#define ARM_DOT_K0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b), (c)); \
+ })
+
+#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT_K0X, n0) \
+ (k0, (a), b, (c)); \
+ })
+
+#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_K0XN0X, m0) \
+ (n0, k0, a, b, c); \
+ })
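+
+/* Illustrative usage (editor's sketch; a, b and c are placeholder names): ARM_MM_K0XN0XM0(4, 2,
+ * 16, a, b, c) accumulates, for each of the M0=4 LHS rows a0..a3 (vectors of K0=16 values),
+ *
+ *   ARM_DOT_K0(16, a0, b0, c0.s0);  ARM_DOT_K0(16, a0, b1, c0.s1);
+ *   ...
+ *   ARM_DOT_K0(16, a3, b0, c3.s0);  ARM_DOT_K0(16, a3, b1, c3.s1);
+ *
+ * i.e. every LHS row is dotted with each of the N0=2 transposed RHS rows b0 and b1, and the result
+ * is accumulated into the corresponding component of the accumulator row.
+ */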
+
+/** Specialized macros to perform a broadcast multiply-accumulate operation between one vector "a"
+ * and K0 vectors "b" of size N0 [1,16] */
+#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) ({ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; })
+#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
+ c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
+ })
+#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
+ })
+#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
+ })
+#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
+ c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
+ c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
+ c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
+ })
+#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
+ c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
+ c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
+ c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
+ c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
+ c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
+ c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
+ c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
+ })
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); })
+#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
+ })
+#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MUL_N0X, k0) \
+ (VECTOR_ACC_TYPE, (a), b, (c)); \
+ })
+#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
+ (VECTOR_ACC_TYPE, k0, a, b, c); \
+ })
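+
+// Illustrative note: the ARM_MUL_*/ARM_MM_NATIVE_* macros above implement the same partial
+// matrix multiplication as the ARM_DOT_* path, but with plain multiply-accumulates
+// (c += CONVERT(b##i, ...) * a.s##i) instead of the arm_dot instruction; gemmlowp_mm_native
+// below selects this path when GPU_ARCH == GPU_ARCH_MIDGARD.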
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && \
+ defined(N)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with
+ * QASYMM8/QASYMM8_SIGNED data type. The LHS matrix must be reshaped with @ref
+ * CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed. The RHS matrix must
+ * be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y +
+ (z * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < k; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ // Update address
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
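+
+// Illustrative example (assumed values, not a canonical configuration): build options such as
+//   -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint -DM=52 -DN=90 -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2
+// would compile the kernel above for a 52x90 output, with each work-item computing a 4x8 block
+// of output elements using 4-wide partial dot products along K.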
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) &&
+       // defined(N)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is
+ * transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in
+ * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+                           0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < K; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ lhs_offset += K0;
+ rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
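+
+// Note (illustrative): compared with gemmlowp_mm_reshaped_lhs_nt_rhs_t above, only the RHS is
+// reshaped here; the LHS is read directly from global memory row by row (lhs_offset advances by
+// K0 per iteration), so no LHS reshape pass is required.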
+
+#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage
+ * using fixed-point arithmetic. The LHS matrix is NOT reshaped The RHS matrix is reshaped with @ref
+ * CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be
+ * passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed
+ * at compile time.
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix.
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in
+ * X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in
+ * Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in
+ * the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix.
+ * Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in
+ * X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in
+ * Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in
+ * the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix
+ * Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in
+ * Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in
+ * Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in
+ * Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS
+ * matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the
+ * output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: S32
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: S32
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: S32
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(
+ IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers), VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+                           0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < K; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ lhs_offset += K0;
+ rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ // Result of MM is of type DATA_TYPE
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(DATA_TYPE) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert result of matrix multiplication to S32
+ REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int);
+
+ int batch_id = z;
+#if defined(DEPTH_GEMM3D)
+ batch_id /= (int)DEPTH_GEMM3D;
+#endif // defined(DEPTH_GEMM3D)
+
+ // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET;
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET);
+
+#if defined(A_OFFSET)
+ // Compute the offset contribution due to A_OFFSET
+ __global uchar *sum_col_addr =
+ sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+#if defined(SUM_COL_HAS_BATCHES)
+ sum_col_addr += z * sum_col_stride_y;
+#endif // defined(SUM_COL_HAS_BATCHES)
+ VEC_DATA_TYPE(int, N0)
+ a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr);
+ a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET;
+
+ REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ // Compute the offset contribution due to B_OFFSET
+ __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes +
+ (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y;
+
+#if defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
+ sum_row_addr += (batch_id % (int)DEPTH_GEMM3D) * (int)HEIGHT_GEMM3D * sizeof(int);
+#endif // defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
+ LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x);
+
+ REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET);
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr =
+ biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+ VEC_DATA_TYPE(int, N0)
+ bias_values = VLOAD(N0)(0, (__global int *)bias_addr);
+ REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values);
+#endif // defined(ADD_BIAS)
+
+ REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_);
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr = result_multipliers_ptr +
+ result_multipliers_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+ VEC_DATA_TYPE(int, N0)
+ res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr);
+ VEC_DATA_TYPE(int, N0)
+ res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr);
+
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER,
+ RESULT_SHIFT);
+#else // RESULT_SHIFT >= 0
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER,
+ RESULT_SHIFT);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ // Add the offset terms to GEMM's result
+ REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET);
+
+#if defined(MIN_BOUND)
+ REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+  // Convert (with saturation) and store the output block
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c_int, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
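+
+// Illustrative summary of the fused output stage performed above (a restatement of the code
+// path, not additional functionality). Per output element:
+//   acc  = c + A_OFFSET * sum_col[n] + B_OFFSET * sum_row[m] + K_OFFSET   (+ bias[n] if ADD_BIAS)
+//   acc  = asymm_mult_by_quant_multiplier(acc, RESULT_MULTIPLIER, RESULT_SHIFT)
+//   acc += RESULT_OFFSET
+//   acc  = clamp(acc, MIN_BOUND, MAX_BOUND)   // only if the bounds are defined
+//   dst  = convert_sat<DATA_TYPE>(acc)
+// When PER_CHANNEL_QUANTIZATION is defined, per-channel multipliers/shifts replace the scalars.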
+#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is NOT reshaped
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e.,
+ * -DK0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: QASYMM8
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in
+ * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+
+ for (; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += K0;
+ rhs_offset += K0 * rhs_stride_y;
+ }
+
+ // Left-over for loop
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,1
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += 1;
+ rhs_offset += rhs_stride_y;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+}
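+
+// Note (illustrative): unlike the reshaped kernels above, gemmlowp_mm_native does not require K
+// to be a multiple of K0; the main loop consumes K0 columns per iteration and the scalar
+// left-over loop handles the remaining K % K0 columns one at a time.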
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+#if defined(COLS_A)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix
+ * A. It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at
+ * compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y +
+ get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 16 accumulations
+ for (; i <= ((int)COLS_A - 16); i += 16)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
+
+ sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
+ }
+
+ // This for loop performs the leftover accumulations
+ for (; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+ sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
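+
+// Conceptually (illustrative): for each row i of matrix A this kernel produces
+//   sum_row[i] = sum_{k = 0..COLS_A-1} A[i][k]   (optionally multiplied by SCALAR)
+// which the offset-contribution code below combines with B_OFFSET.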
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A
+ * using the arm dot product instruction. It is also possible to multiply each reduced row by a
+ * scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y +
+ get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+  // This for loop performs 32 accumulations
+ for (; i <= ((int)COLS_A - 32); i += 32)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ a0 = vload16(0, matrix_a + i);
+
+ sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+
+ a0 = vload16(1, matrix_a + i);
+
+ sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ }
+
+ // This for loop performs the leftover accumulations
+ for (; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
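+
+// Note (illustrative): the dot8 variant above reduces four 8-bit values at a time by taking the
+// dot product with a vector of ones, i.e. arm_dot(v, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)) ==
+// v.s0 + v.s1 + v.s2 + v.s3.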
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(COLS_A)
+
+#if defined(COLS_B) && defined(ROWS_B)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of
+ * Matrix B. It is also possible to multiply each reduced column by a scalar value, if SCALAR is
+ * passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using
+ * -DCOLS_B and -DROWS_B
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+ sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0;
+
+ __global const DATA_TYPE *matrix_b =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for (; i <= ((int)ROWS_B - 4); i += 4)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b + 0 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b1 = vload16(0, matrix_b + 1 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b2 = vload16(0, matrix_b + 2 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b3 = vload16(0, matrix_b + 3 * src_stride_y);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+
+ matrix_b += 4 * src_stride_y;
+ }
+
+  // This for loop performs the leftover accumulations
+ for (; i < (int)ROWS_B; ++i)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+
+ matrix_b += src_stride_y;
+ }
+
+#if defined(SCALAR)
+ sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR;
+#endif // defined(SCALAR)
+ VSTORE(16)
+ (convert_int16(sum_col_32), 0, (__global int *)dst.ptr);
+}
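+
+// Conceptually (illustrative): for each column j of matrix B this kernel produces
+//   sum_col[j] = sum_{k = 0..ROWS_B-1} B[k][j]   (optionally multiplied by SCALAR)
+// the counterpart of sum_row above, later combined with A_OFFSET.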
+#endif // defined(COLS_B) && defined(ROWS_B)
+
+#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(K_OFFSET)
+
+/* Helper function used to calculate the offset contribution after matrix multiplication.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * @param[in] x get_global_id(0) * 4
+ * @param[in] y get_global_id(1)
+ * @param[in] z get_global_id(2)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ */
+inline int4 offset_contribution(int x, int y, int z
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ int4 a_offset_s32 = (int4)0;
+ int4 b_offset_s32 = (int4)0;
+
+ int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+ batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+ // Compute the offset contribution due to A_OFFSET
+ __global uchar *sum_col_addr =
+ sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+#if defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else // defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+ a_offset_s32 *= (int4)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+  // Compute the address of the row sums (sum_row)
+ __global uchar *sum_row_addr =
+ sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+ // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) +
+ (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 *= (int4)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ b_offset_s32 += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
+
+/** OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is
+ * performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along
+ * X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along
+ * Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along
+ * Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // Store the result with the offset contribution
+ vstore4(in_s32, 0, (__global int *)mm_result_addr);
+}
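As a sketch of where this formula comes from (not part of the patch; it assumes the usual gemmlowp
convention that sum_col holds the column sums of matrix B and sum_row the row sums of matrix A):

    // sum_j (a[i][j] + a_offset) * (b[j][k] + b_offset)
    //   = sum_j a[i][j] * b[j][k]      // raw int32 accumulator (mm_result)
    //   + a_offset * sum_col[k]        // A_OFFSET term
    //   + b_offset * sum_row[i]        // B_OFFSET term
    //   + a_offset * b_offset * k      // K_OFFSET
    inline int offset_contribution_reference(int sum_col_k, int sum_row_i, int a_offset,
                                             int b_offset, int k)
    {
      return a_offset * sum_col_k + b_offset * sum_row_i + a_offset * b_offset * k;
    }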
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && \
+ defined(OUTPUT_DATA_TYPE)
+/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and
+ * quantize the result down to uint8/int8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of
+ * @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and
+ * quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the
+ * following operations:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND
+ * are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor.
+ * Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of
+ * elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in
+ * the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                  Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in]  dst_step_z                                    dst_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+  // Add the result offset (RESULT_OFFSET)
+ in_s32 += (int4)RESULT_OFFSET;
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr =
+ result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+ int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
+
+ in_s32 *= result_multipliers_values;
+ in_s32 >>= result_shifts_values;
+#else // defined(PER_CHANNEL_QUANTIZATION)
+ in_s32 *= RESULT_MULTIPLIER;
+
+ in_s32 >>= RESULT_SHIFT;
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
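A per-lane sketch of the output stage above (not part of the patch; it assumes a non-negative
RESULT_SHIFT and a QASYMM8 output, and the helper name is hypothetical):

    inline uchar quantize_down_reference(int acc_with_offsets, int result_offset,
                                         int result_multiplier, int result_shift)
    {
      int v = (acc_with_offsets + result_offset) * result_multiplier;
      v >>= result_shift;          // plain arithmetic shift, no rounding in this variant
      return convert_uchar_sat(v); // saturate to [0..255] as for QASYMM8
    }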
+
+/** OpenCL kernel used to add the offset contribution after matrix multiplication and quantize the
+ * result down to uint8/int8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to
+ * it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output
+ * stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the
+ * following operations:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor.
+ * Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of
+ * elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in
+ * the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                  Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in]  dst_step_z                                    dst_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void
+gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr =
+ result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+ int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
+
+ int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ in_s32, result_multipliers_values, result_shifts_values, 4);
+ int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ in_s32, result_multipliers_values, result_shifts_values, 4);
+ in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ in_s32 =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+  // Add the result offset (RESULT_OFFSET)
+ in_s32 += (int4)RESULT_OFFSET;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) &&
+ // defined(OUTPUT_DATA_TYPE)
+
+#endif // defined(K_OFFSET)
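The ASYMM_MULT_BY_QUANT_MULTIPLIER_* helpers used by the fixed-point kernels above implement the
gemmlowp "rounding doubling high multiply" followed by a rounding right shift. A rough per-lane
sketch (not part of the patch; it ignores the different nudge used for negative products, the
INT_MIN overflow case, and the fact that a negative shift is applied before the multiply):

    inline int requantize_fixedpoint_reference(int acc, int multiplier, int right_shift)
    {
      long prod  = (long)acc * (long)multiplier;
      int high32 = (int)((prod + (1L << 30)) >> 31); // doubling high multiply
      if (right_shift <= 0)
      {
        return high32 << -right_shift;
      }
      return (high32 + (1 << (right_shift - 1))) >> right_shift; // round-to-nearest by 2^shift
    }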
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final
+ * QASYMM8/QASYMM8_SIGNED value. The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND
+ * are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ *    - to the [0..255] range and cast to QASYMM8.
+ *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                           Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in]  dst_step_z                             dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+  // Add the result offset (RESULT_OFFSET)
+ input_values += (int4)RESULT_OFFSET;
+
+ // Multiply by result_mult_int and shift
+ input_values *= RESULT_MULT_INT;
+
+#if RESULT_SHIFT < 0
+ input_values >>= -RESULT_SHIFT;
+#else // RESULT_SHIFT >= 0
+ input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
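For reference, the macros consumed by the kernel above are passed as program build options; the
option names below come from the #if guards in this file, while the values are made-up placeholders:

    // -DRESULT_OFFSET=3 -DRESULT_MULT_INT=49152 -DRESULT_SHIFT=16 -DOUTPUT_DATA_TYPE=uchar
    // -DADD_BIAS -DMIN_BOUND=0 -DMAX_BOUND=255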
+
+#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && \
+ defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. The following computations will be
+ * performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER
+ * and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                           Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in]  dst_step_z                             dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+  // Add the output offset (RESULT_OFFSET_AFTER_SHIFT)
+ input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) &&
+ // defined(RESULT_SHIFT)
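A worked example of the "round to nearest division by a power-of-two" step above, with
result_shift = 3 (for negative inputs the helper adjusts the rounding threshold, which this
example does not show):

    // 1000 / 8 = 125.0   ->  (1000 + 4) >> 3 = 125
    // 1003 / 8 = 125.375 ->  (1003 + 4) >> 3 = 125
    // 1004 / 8 = 125.5   ->  (1004 + 4) >> 3 = 126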
+
+#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QSYMM16 value. The following computations will be performed by
+ * the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
+ *
+ * @attention The scalar scale factor and number of bits to shift right of the output tensor
+ * must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: QSYMM16
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                           Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in]  dst_step_z                             dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+ short4 res = convert_short4_sat(input_values);
+
+#if defined(MIN_BOUND)
+ res = max(res, (short4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (short4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global short *)dst_addr);
+}
+#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
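The RESULT_FIXEDPOINT_MULTIPLIER / RESULT_SHIFT pair consumed by the fixed-point kernels is
typically derived on the host from a real rescale factor; a sketch under the usual gemmlowp
convention (this derivation is not defined by the patch itself):

    // real_multiplier = input_scale * weights_scale / output_scale
    // normalise real_multiplier into [0.5, 1) by multiplying with 2^shift, then:
    // RESULT_FIXEDPOINT_MULTIPLIER = (int)round(normalised_multiplier * 2147483648.0) // * 2^31
    // RESULT_SHIFT                 = shift (positive -> rounding right shift after the multiply)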
+
+#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. The following computations will be
+ * performed by the kernel:
+ *
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Convert each entry to float and multiply it by the real scale factor REAL_MULTIPLIER
+ * -# Add the output offset OUTPUT_OFFSET and round to the nearest integer
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using
+ * -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data
+ * type: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in
+ * bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                           Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in]  dst_step_z                             dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                           Stride of the destination tensor in W dimension (in
+ * bytes)
+ * @param[in]  dst_step_w                             dst_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+#if defined(DST_HEIGHT)
+ TENSOR4D_DECLARATION(dst))
+#else // defined(DST_HEIGHT)
+ TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Convert to float
+ float4 input_values_f = convert_float4(input_values);
+ input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
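A per-lane sketch of the float output stage above (not part of the patch; a QASYMM8 output is
assumed and the helper name is hypothetical):

    inline uchar quantize_down_float_reference(int acc, int bias, float real_multiplier,
                                               float output_offset)
    {
      float v = round(convert_float(acc + bias) * real_multiplier + output_offset);
      return convert_uchar_sat((int)v);
    }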
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
index 80ba73d1d..85fc09de4 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
@@ -41,7 +41,7 @@
#include "helpers.h"
#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \
- defined(COLS_A)
+ defined(COLS_A)
#define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X)
@@ -117,7 +117,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(
,
uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
- )
+)
{
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -208,9 +208,9 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
// Load values from matrix B
VECTOR_CHAR b0 =
- VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(
- 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y));
+ 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y));
// Accumulate
acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0;
@@ -251,7 +251,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
// Load values from matrix B
VECTOR_CHAR b0 =
- VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
// Accumulate
acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0;
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
index a4f7dbd48..3ace1fde8 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -115,15 +115,15 @@ __kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION
int lup_id[4] = {0};
- lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
- : get_global_id(0);
- lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1)))
- : get_global_id(1);
+ lup_id[0] =
+ (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0);
+ lup_id[1] =
+ (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1);
lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
: get_global_id(2) % DEPTH_OUT;
lup_id[3] = (NUM_DIMS == 4)
- ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
- : get_global_id(2) / DEPTH_OUT;
+ ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+ : get_global_id(2) / DEPTH_OUT;
if (lup_id[NUM_DIMS - 1] < 0)
{
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index e07a25ec9..4a3bc1369 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -49,7 +49,7 @@
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
- defined(cl_arm_integer_dot_product_accumulate_int8)
+ defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
// defined(cl_arm_integer_dot_product_accumulate_int8)
@@ -288,21 +288,21 @@
#define VECTOR_DECLARATION(name) \
__global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
- uint name##_offset_first_element_in_bytes
+ uint name##_offset_first_element_in_bytes
#define IMAGE_DECLARATION(name) \
__global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
- uint name##_step_y, uint name##_offset_first_element_in_bytes
+ uint name##_step_y, uint name##_offset_first_element_in_bytes
#define TENSOR3D_DECLARATION(name) \
__global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
- uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
- uint name##_offset_first_element_in_bytes
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
+ uint name##_offset_first_element_in_bytes
#define TENSOR4D_DECLARATION(name) \
__global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
- uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
- uint name##_step_w, uint name##_offset_first_element_in_bytes
+ uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
+ uint name##_step_w, uint name##_offset_first_element_in_bytes
#define CONVERT_TO_VECTOR_STRUCT(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
@@ -406,9 +406,9 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
uint stride_x, uint step_x)
{
Vector vector = {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
};
vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
return vector;
@@ -436,7 +436,7 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
.stride_x = stride_x,
.stride_y = stride_y};
img.ptr +=
- img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
return img;
}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
index 5f1b3f902..d7f1d0814 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -100,16 +100,16 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return quantized values
*/
-#define QUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(type, size) \
- quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
- { \
- VEC_DATA_TYPE(float, size) \
- out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
- VEC_DATA_TYPE(type, size) \
- res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
- VEC_DATA_TYPE(type, size)); \
- return res; \
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = \
+ CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
+ return res; \
}
/** Dequantize a vector of values to floating-point
@@ -119,11 +119,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return dequantized values in floating point
*/
-#define DEQUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(float, size) \
- dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
- { \
- return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
}
/** Correctly-rounded-to-nearest division by a power-of-two.
@@ -134,7 +134,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*/
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
- VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
{ \
const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
@@ -152,32 +152,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Product of two fixed-point numbers.
*/
-#define ASYMM_MULT_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(int, size) \
- overflow = a == b && a == INT_MIN; \
- VEC_DATA_TYPE(long, size) \
- a_64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b_64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- ab_64 = a_64 * b_64; \
- /* Revert COMPMID-907 */ \
- VEC_DATA_TYPE(long, size) \
- mask1 = 1 << 30; \
- VEC_DATA_TYPE(long, size) \
- mask2 = 1 - (1 << 30); \
- VEC_DATA_TYPE(long, size) \
- is_positive_or_zero = ab_64 >= 0; \
- VEC_DATA_TYPE(long, size) \
- nudge = select(mask2, mask1, is_positive_or_zero); \
- VEC_DATA_TYPE(long, size) \
- mask = 1ll << 31; \
- VEC_DATA_TYPE(int, size) \
- ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
- return select(ab_x2_high32, INT_MAX, overflow); \
+#define ASYMM_MULT_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ overflow = a == b && a == INT_MIN; \
+ VEC_DATA_TYPE(long, size) \
+ a_64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b_64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ ab_64 = a_64 * b_64; \
+ /* Revert COMPMID-907 */ \
+ VEC_DATA_TYPE(long, size) \
+ mask1 = 1 << 30; \
+ VEC_DATA_TYPE(long, size) \
+ mask2 = 1 - (1 << 30); \
+ VEC_DATA_TYPE(long, size) \
+ is_positive_or_zero = ab_64 >= 0; \
+ VEC_DATA_TYPE(long, size) \
+ nudge = select(mask2, mask1, is_positive_or_zero); \
+ VEC_DATA_TYPE(long, size) \
+ mask = 1ll << 31; \
+ VEC_DATA_TYPE(int, size) \
+ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
+ return select(ab_x2_high32, INT_MAX, overflow); \
}
/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
@@ -186,32 +186,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
- a) \
- { \
- const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
- const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
- const int k_fractional_bits = 31; \
- VEC_DATA_TYPE(int, size) \
- x = a + (1 << (k_fractional_bits - 3)); \
- VEC_DATA_TYPE(int, size) \
- x2 = ASYMM_MULT(x, x, size); \
- VEC_DATA_TYPE(int, size) \
- x3 = ASYMM_MULT(x2, x, size); \
- VEC_DATA_TYPE(int, size) \
- x4 = ASYMM_MULT(x2, x2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2 = \
- ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
- ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
- return constant_term + \
- ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+ a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = \
+ ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + \
+ ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
}
/** Each bit of the result is set to the corresponding bit of either then_val or
@@ -263,15 +263,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
#define EXP_BARREL_SHIFTER_IMPL(size) \
inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
- VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
- int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
{ \
if (k_integer_bits > exponent) \
{ \
const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
return ASYMM_SELECT_USING_MASK( \
- ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
- ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
} \
\
return result; \
@@ -285,7 +285,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*/
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
inline VEC_DATA_TYPE(int, size) \
- asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
{ \
const int k_fractional_bits = 31 - k_integer_bits; \
VEC_DATA_TYPE(int, size) \
@@ -298,7 +298,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
VEC_DATA_TYPE(int, size) \
result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
- a_mod_quarter_minus_one_quarter_scaled, size); \
+ a_mod_quarter_minus_one_quarter_scaled, size); \
VEC_DATA_TYPE(int, size) \
remainder = a_mod_quarter_minus_one_quarter - a; \
\
@@ -312,10 +312,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
remainder, size); \
result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
- size); \
result = \
- EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = \
+ EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
\
if (k_integer_bits > 5) \
{ \
@@ -335,27 +335,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Arithmetic left or right shift.
*/
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- if (exponent < 0) \
- { \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) min = INT_MIN; \
- const VEC_DATA_TYPE(int, size) max = INT_MAX; \
- int threshold = ((1 << (31 - exponent)) - 1); \
- VEC_DATA_TYPE(int, size) \
- positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
- VEC_DATA_TYPE(int, size) \
- negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
- VEC_DATA_TYPE(int, size) \
- result = x << exponent; \
- result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
- result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
- return result; \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
}
/** Calculates (a+b)/2, rounded to the nearest integer.
@@ -365,21 +365,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return (a+b)/2, rounded to the nearest integer.
*/
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(long, size) \
- a64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- sum = a64 + b64; \
- const VEC_DATA_TYPE(long, size) one = 1; \
- const VEC_DATA_TYPE(long, size) minus_one = -1; \
- VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, sum >= 0); \
- return convert_int##size((sum + sign) / 2); \
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, sum >= 0); \
+ return convert_int##size((sum + sign) / 2); \
}
/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
@@ -390,7 +390,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*/
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
inline VEC_DATA_TYPE(int, size) \
- asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
{ \
const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
@@ -462,14 +462,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
- { \
- const int left_shift = shift > 0 ? shift : 0; \
- const int right_shift = shift > 0 ? 0 : -shift; \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
- right_shift, size); \
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
+ right_shift, size); \
}
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
multiply_by_quantized_multiplier##size(input, qmul, shift)
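Only the indentation of QUANTIZE_IMPL / DEQUANTIZE_IMPL changes above; as a reminder of what they generate, here is an instantiation plus a hand-worked scalar round trip (the scale/offset values are illustrative only):

    QUANTIZE_IMPL(uchar, 4)   // defines quantize_uchar4(float4 input, float offset, float scale)
    DEQUANTIZE_IMPL(uchar, 4) // defines dequantize_uchar4(uchar4 input, float offset, float scale)
    // Round trip with scale = 0.5f, offset = 10.0f, x = 3.2f:
    //   q  = convert_sat(round_rte(3.2f / 0.5f + 10.0f)) = 16
    //   x' = (16 - 10.0f) * 0.5f                         = 3.0f   (quantization error 0.2f)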
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
index 014842680..96a243110 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
@@ -41,7 +41,7 @@
#include "helpers.h"
#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
- defined(DIM_Y) && defined(DIM_Z)
+ defined(DIM_Y) && defined(DIM_Z)
/** This function normalizes the input 2D tensor across the first dimension with respect to mean and
* standard deviation of the same dimension.
*
@@ -108,14 +108,14 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
TENSOR4D_DECLARATION(output)
#endif /* IN_PLACE */
#ifdef GAMMA
- ,
+ ,
VECTOR_DECLARATION(gamma)
#endif // GAMMA
#ifdef BETA
- ,
+ ,
VECTOR_DECLARATION(beta)
#endif // BETA
- )
+)
{
Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
#ifndef IN_PLACE
@@ -213,12 +213,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
for (int i_h = 0; i_h < DIM_Z; ++i_h)
{
__global DATA_TYPE *input_address =
- (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+ (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
#ifdef IN_PLACE
__global DATA_TYPE *output_address = input_address;
#else /* !IN_PLACE */
__global DATA_TYPE *output_address =
- (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+ (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
#endif /* IN_PLACE */
*(output_address) = (*(input_address)-mean) * multip + beta;
}
@@ -231,12 +231,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
{
__global DATA_TYPE *input_address =
- (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+ (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
#ifdef IN_PLACE
__global DATA_TYPE *output_address = input_address;
#else /* !IN_PLACE */
__global DATA_TYPE *output_address =
- (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+ (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
#endif /* IN_PLACE */
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -251,12 +251,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
for (; x < DIM_X; ++x)
{
__global DATA_TYPE *input_address =
- (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+ (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
#ifdef IN_PLACE
__global DATA_TYPE *output_address = input_address;
#else /* !IN_PLACE */
__global DATA_TYPE *output_address =
- (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+ (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
#endif /* IN_PLACE */
*(output_address) = (*(input_address)-mean) * multip + beta;
}
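The hunks above only re-wrap the stores; for context, and assuming multip is precomputed from the per-channel variance earlier in the kernel (outside these hunks), the stored expression is the usual instance-normalization update:

    //   multip = gamma * rsqrt(var + EPSILON)   (gamma treated as 1 when GAMMA is not defined)
    //   out    = (in - mean) * multip + beta    (beta  treated as 0 when BETA  is not defined)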
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl
new file mode 100644
index 000000000..51919c8a5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with the given value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might
+ * need to step back a bit)
+ *
+ * @param[in] tensor_ptr Pointer to the source image. Data types
+ * supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] tensor_step_x tensor_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] tensor_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] tensor_step_y tensor_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] value The value used to fill the pages of the tensor
+ */
+__kernel void memset(TENSOR3D_DECLARATION(tensor))
+{
+ Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = (DATA_TYPE)(CONSTANT_VALUE);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else // !defined(VEC_SIZE)
+ *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // Check for compile time constants
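The kernel is driven entirely by the compile-time defines documented above. A plausible option set for filling an 18-element float row four lanes at a time is sketched below; the concrete values are illustrative, and the real ones are chosen by the host-side CLMemsetKernel:

    // -DDATA_TYPE=float -DCONSTANT_VALUE=0 -DVEC_SIZE=4 -DLAST_ACCESSED_X=14
    // With width 18 and VEC_SIZE 4 the last work item starts at xi = 16; since
    // 16 > LAST_ACCESSED_X (here 18 - 4 = 14) it steps back by (16 - 14) elements
    // and rewrites elements 14..17 instead of running past the end of the row.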
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
index 3943fc4c2..abbfbd275 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
@@ -114,8 +114,8 @@ __kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION
(val, 0, (__global DATA_TYPE *)output.ptr);
#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
*((__global DATA_TYPE *)(output.ptr)) =
- ((DATA_TYPE)(*((__global int *)(input.ptr)))) *
- *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier);
+ ((DATA_TYPE)(*((__global int *)(input.ptr)))) *
+ *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier);
#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
}
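In scalar terms the store above converts each int input element to DATA_TYPE and multiplies it by its row's scale factor and the global multiplier; the numbers below are just an example:

    //   out[x][y] = (DATA_TYPE)in[x][y] * scale[y] * multiplier
    //   e.g. in = 25, scale[y] = 0.02f, multiplier = 1.0f  ->  out = 0.5f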
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
new file mode 100644
index 000000000..784a8d6aa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
+
+/** Performs the OneHot operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] indices_ptr Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in]  indices_stride_z                       Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in] on_value_ptr Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in] on_value_stride_x Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in] on_value_step_x on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value
+ * vector
+ * @param[in] off_value_ptr Pointer to the off_value vector. Supported
+ * data types: Same as @p on_value.
+ * @param[in] off_value_stride_x Stride of the off_value vector in X
+ * dimension (in bytes)
+ * @param[in] off_value_step_x off_value_stride_x * number of elements
+ * along X processed per work item (in bytes)
+ * @param[in] off_value_offset_first_element_in_bytes Offset of the first element in the off_value
+ * vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+ VECTOR_DECLARATION(off_value), TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+ const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+ const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+ Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+ const int index = *(__global const int *)tensor3D_offset(&indices, py, pz, pw);
+ *(__global DATA_TYPE *)output.ptr = index == px ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, pz, pw);
+ *(__global DATA_TYPE *)output.ptr = index == py ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pw);
+ *(__global DATA_TYPE *)output.ptr = index == pz ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+ *(__global DATA_TYPE *)output.ptr = index == pw ? *((__global const DATA_TYPE *)on_value_ptr)
+ : *((__global const DATA_TYPE *)off_value_ptr);
+#endif // AXIS
+}
+
+/** Performs the OneHot operation along the chosen axis, with off_value assumed to be zero
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] indices_ptr Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in]  indices_stride_z                       Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in] on_value_ptr Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in] on_value_stride_x Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in] on_value_step_x on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value
+ * vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+ TENSOR4D_DECLARATION(output))
+{
+ const int px = get_global_id(0);
+ const int py = get_global_id(1);
+ const int pz = get_global_id(2);
+
+ const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+ const Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, OUTPUT_DIM_Z);
+
+ const int index = *(__global const int *)tensor3D_offset(&indices, px, py, pz);
+
+ if (index < 0 || index >= DEPTH)
+ return;
+
+#if AXIS == 0
+ *(__global DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 1
+ *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 2
+ *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 3
+ *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) =
+ *((__global const DATA_TYPE *)on_value_ptr);
+#endif // AXIS
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
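A small made-up case illustrates what the first kernel produces: with 1-D indices laid out along Y and AXIS=0, every output column is the one-hot encoding of the corresponding index:

    // indices = {1, 0, 3}, DEPTH = 4, AXIS = 0, on_value = 1, off_value = 0
    // output[px][py] = (indices[py] == px) ? on : off, i.e. column by column:
    //   py=0 -> {0,1,0,0}   py=1 -> {1,0,0,0}   py=2 -> {0,0,0,1}
    // one_hot_only_on_value additionally skips indices outside [0, DEPTH); the untouched
    // positions presumably hold off_value written beforehand (e.g. by the memset kernel).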
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl
new file mode 100644
index 000000000..96f2f9ef0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && \
+ defined(SRC_WIDTH)
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE)
+#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE)
+
+#if defined(CONST_VAL)
+/** Perform a pad operation when PaddingMode is CONSTANT
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag,
+ * e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g.
+ * -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g.
+ * -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile
+ * flag, e.g. -DSELECT_DT=float
+ * @note In case pad left is more than the vector size, the number of threads to skip along the X
+ * axis must be passed using the -DNUM_THREADS_TO_SKIP_X compile flag, e.g.
+ * -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be
+ * passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g.
+ * -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If pad also needs to be added to the batch of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g.
+ * -DPAD_W_BEFORE=3)
+ * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types:
+ * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ * @param[in] batch (Optional) Batch index if 4D pad must be applied
+ */
+__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(PAD_W_BEFORE)
+ ,
+ uint batch
+#endif // defined(PAD_W_BEFORE)
+)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ uint cond = 0;
+
+#if defined(PAD_W_BEFORE)
+ cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
+#endif // defined(PAD_Z_BEFORE)
+
+ if (cond)
+ {
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr);
+ }
+ else
+ {
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(NUM_THREADS_TO_SKIP_X)
+ /* In case the pad left is greater than the vector size, and we are past the threads operating
+ * solely on pad values, the input pointer must be brought back along the X axis to start from
+ * the first non-pad values.
+ *
+ * E.g. with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|:
+ * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1))
+ * < PAD_X_BEFORE
+ * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2,
+ * 3)) < PAD_X_BEFORE
+ * -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x *
+ * VEC_SIZE) == 4) values, reading |4 5|
+ * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and
+ * check that it is >= to the current x
+   * -# So, we bring the pointer back by NUM_THREADS_TO_SKIP_X threads, which means multiplying
+   *    this constant by the input's step along the X axis
+   * -# Now that the pointer is back by ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it
+   *    will read the desired values |0 1|
+ */
+ src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X);
+#endif // defined(NUM_THREADS_TO_SKIP_X)
+#if defined(PAD_Z_BEFORE)
+ src.ptr -= PAD_Z_BEFORE * src_step_z;
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+ src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z;
+#endif // defined(PAD_W_BEFORE)
+
+ VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+ VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT);
+ VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+#if defined(PAD_Y_BEFORE)
+ cond |=
+ (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
+ VSTORE(VEC_SIZE)
+ (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0,
+ (__global DATA_TYPE *)dst.ptr);
+ }
+}
+#endif // defined(CONST_VAL)
+
+#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && \
+ defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && \
+ defined(AFTER_PAD_FACT_X)
+
+#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1
+#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
+#define SYMM_REFL_LEFT(x, n0, n1) \
+ select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
+#define SYMM_REFL_RIGHT(x, n0, n1) \
+ select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
+
+/** Perform a pad operation when PaddingMode is SYMMETRIC
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g.
+ * -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g.
+ * -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile
+ * flag, e.g. -DSELECT_DT=float
+ * @note Number of values to the left when operating across left padding must be passed using the
+ * -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
+ * @note Number of values to the left when operating across right padding must be passed using the
+ * -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is
+ * REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the
+ * -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER=5
+ * @note When after pad X, starting point to read backward from must be passed using the
+ * -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
+ * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be
+ * set to 0
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be
+ * passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g.
+ * -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If the starting point to read backward from is less than the output's last element accessed
+ * in the X, the following compile flags must be passed at compile time to avoid negative offsets:
+ * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation
+ * attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types:
+ * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ */
+__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Get current thread position
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Define conditions based on the thread X position w.r.t. pad left and right
+ const int x_out_first = x * VEC_SIZE;
+ const int x_out_last = x_out_first + VEC_SIZE;
+ const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE);
+ const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
+ const int is_inside_input =
+ (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_across_pad_right =
+ (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
+
+ // Calculate base pointers
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Calculate input tensor's offset based on the defined conditions
+ int x_offset = 0;
+ x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
+ x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
+ x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
+ x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
+
+#if defined(AFTER_PAD_REM)
+ int neg_offs = x_offset < 0;
+ x_offset = max(x_offset, 0);
+#endif // defined(AFTER_PAD_REM)
+
+ // Load input values from the computed offset
+ int y_in = y;
+ int z_in = z;
+#if defined(PAD_Y_BEFORE)
+ y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
+ y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1,
+ y >= (SRC_HEIGHT + PAD_Y_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
+ z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1,
+ z >= (SRC_DEPTH + PAD_Z_BEFORE));
+#endif // defined(PAD_Z_BEFORE)
+
+ src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
+
+#if SRC_WIDTH == 1
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
+#else // SRC_WIDTH == 1
+
+ VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+ // Choose rearrangement policy based on the defined conditions
+ src_vals =
+ select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL),
+ SCALAR_COND(is_across_pad_left));
+ src_vals =
+ select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL),
+ SCALAR_COND(is_across_pad_right));
+ src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE),
+ SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+#if defined(AFTER_PAD_REM)
+ src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+#endif // defined(AFTER_PAD_REM)
+
+ // Store
+ VSTORE(VEC_SIZE)
+ (src_vals, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // SRC_WIDTH == 1
+}
+#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) &&
+ // defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) &&
+ // defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) &&
+ // defined(SRC_WIDTH)
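A hand-worked case makes the left-edge rearrangement concrete; all numbers below are illustrative:

    // Input row |1 2 3 4| (SRC_WIDTH=4), PAD_X_BEFORE=2, VEC_SIZE=2, first thread (x=0):
    //   x_out_last = 2  ->  is_before_pad_left, so x_offset = 2 - 2 + IS_REFLECT
    //   SYMMETRIC (IS_REFLECT=0): loads |1 2| at offset 0, REVERSE -> stores |2 1|
    //   REFLECT   (IS_REFLECT=1): loads |2 3| at offset 1, REVERSE -> stores |3 2|
    // yielding padded rows |2 1 1 2 3 4 ...| and |3 2 1 2 3 4 ...| respectively.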
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
index 76fda9041..532000e9e 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -138,7 +138,7 @@ __kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARAT
// Multiply with a multiplier smaller than 1
out_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
index 4ae9adb0b..c829f264d 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
@@ -116,7 +116,7 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc
// Create scale vector
const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale =
- *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));
+ *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));
// Quantize
VEC_DATA_TYPE(int, VEC_SIZE)
@@ -127,10 +127,10 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc
(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
#else //! defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
*((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(
- CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) /
- (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))),
- int),
- MIN_QUANT_VAL, MAX_QUANT_VAL);
+ CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) /
+ (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))),
+ int),
+ MIN_QUANT_VAL, MAX_QUANT_VAL);
#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
}
#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
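The scalar fallback re-wrapped above is the plain symmetric per-row quantization formula; with an illustrative scale of 0.05f and a [-127, 127] range (the real MIN_QUANT_VAL / MAX_QUANT_VAL are compile-time defines):

    //   q = clamp(round_rte(in / scale[row]), MIN_QUANT_VAL, MAX_QUANT_VAL)
    //   in = 1.23f, scale[row] = 0.05f  ->  round_rte(24.6f) = 25  ->  q = 25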
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
index 832ac1270..d0ef31b20 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -100,12 +100,14 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(o
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
int indices[4] = {
- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
};
DATA_TYPE value =
- *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
for (int i = 1; i < dim; ++i)
{
indices[axis] = i;
@@ -186,16 +188,18 @@ __kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
int indices[4] = {
- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
+ get_global_id(0),
+ get_global_id(1),
+ get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
};
DATA_TYPE sum_value = (DATA_TYPE)0;
for (int i = 0; i < dim; ++i)
{
indices[axis] = i;
- sum_value += *(
- (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ sum_value +=
+ *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
}
#if OP_CODE == 3 // REDUCE_SUM
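The re-wrapped index initialisation decodes the flattened third work dimension back into (z, w); for example, with DEPTH_OUT = 8 a global id of 19 on dimension 2 gives:

    //   indices[2] = 19 % 8 = 3   (position within the output depth)
    //   indices[3] = 19 / 8 = 2   (batch coordinate)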
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h
new file mode 100644
index 000000000..cfc811cce
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+#include "helpers.h"
+
+/** Macros that help in loop unrolling */
+// Repeat macros with 3 param, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(1, P_A, P_B, P_C); \
+ REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(2, P_A, P_B, P_C); \
+ REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(3, P_A, P_B, P_C); \
+ REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(4, P_A, P_B, P_C); \
+ REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(5, P_A, P_B, P_C); \
+ REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(6, P_A, P_B, P_C); \
+ REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(7, P_A, P_B, P_C); \
+ REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(8, P_A, P_B, P_C); \
+ REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(9, P_A, P_B, P_C); \
+ REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(A, P_A, P_B, P_C); \
+ REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(B, P_A, P_B, P_C); \
+ REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(C, P_A, P_B, P_C); \
+ REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(D, P_A, P_B, P_C); \
+ REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(E, P_A, P_B, P_C); \
+ REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(F, P_A, P_B, P_C); \
+ REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+ REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) // One level of indirection to ensure order of expansion
+ // does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+// Repeat macros with 4 param, excluding the implicit ID param
+#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
+#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(1, P_A, P_B, P_C, P_D); \
+ REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(2, P_A, P_B, P_C, P_D); \
+ REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(3, P_A, P_B, P_C, P_D); \
+ REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(4, P_A, P_B, P_C, P_D); \
+ REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(5, P_A, P_B, P_C, P_D); \
+ REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(6, P_A, P_B, P_C, P_D); \
+ REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(7, P_A, P_B, P_C, P_D); \
+ REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(8, P_A, P_B, P_C, P_D); \
+ REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(9, P_A, P_B, P_C, P_D); \
+ REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(A, P_A, P_B, P_C, P_D); \
+ REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(B, P_A, P_B, P_C, P_D); \
+ REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(C, P_A, P_B, P_C, P_D); \
+ REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(D, P_A, P_B, P_C, P_D); \
+ REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(E, P_A, P_B, P_C, P_D); \
+ REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(F, P_A, P_B, P_C, P_D); \
+ REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
+
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) // One level of indirection to ensure order of
+ // expansion does not affect preprocessing P_NUM
+#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
+
+// Macro for initializing N variables. Generates N statements that define VAR##N =
+// RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
+
+// Macro for initializing N variables by converting the data type. Generates N statements that
+// define VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \
+ TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+
+// Macro for adding a constant to N variables. Generates N statements that define VAR##N =
+// RHS_ACCESSOR_DEF(...)
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
+
+// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables
+// (VAR_A). Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) \
+ REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
+
+// Macro for adding a vector to N variables. Generates N statements that define VAR##N =
+// RHS_ACCESSOR_DEF(...)
+#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+
+// Macro for adding two sets of N variables. Generates N statements that define VAR_A##N =
+// RHS_ACCESSOR_DEF(...)
+#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+
+// Macro for performing Max between a constant and N variables. Generates N statements that
+// define VAR##N = RHS_ACCESSOR_DEF(...)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
+
+// Macro for performing Min between a constant and N variables. Generates N statements that
+// define VAR##N = RHS_ACCESSOR_DEF(...)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
+
+// Macro for applying ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N
+// statements that define VAR##N = RHS_ACCESSOR_DEF(...)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+// Macro for applying ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N
+// statements that define VAR##N = RHS_ACCESSOR_DEF(...)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ ({ \
+ VEC_DATA_TYPE(int, N0) \
+ VAR##ID_shift_lt0 = \
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
+ VEC_DATA_TYPE(int, N0) \
+ VAR##ID_shift_gt0 = \
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
+ VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \
+ })
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+#endif // ARM_COMPUTE_REPEAT_H
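Editor's note: the REPEAT_* macros above unroll a compile-time count of statements through recursive expansion. A self-contained host-side C++ sketch (illustration only, not part of the patch) that copies the 3-parameter family up to N=3 and shows what two of the helpers expand to:

#include <cstdio>

#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
#define REPEAT_3_2(P_X, P_A, P_B, P_C) P_X##_DEF(1, P_A, P_B, P_C); REPEAT_3_1(P_X, P_A, P_B, P_C)
#define REPEAT_3_3(P_X, P_A, P_B, P_C) P_X##_DEF(2, P_A, P_B, P_C); REPEAT_3_2(P_X, P_A, P_B, P_C)
#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C)
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)

#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL

int main()
{
  REPEAT_3_N(3, VAR_INIT_TO_CONST, int, acc, 0); // int acc2 = 0; int acc1 = 0; int acc0 = 0;
  REPEAT_3_N(3, ADD_CONST_TO_VAR, int, acc, 7);  // acc2 += (int)7; acc1 += (int)7; acc0 += (int)7;
  std::printf("%d %d %d\n", acc0, acc1, acc2);   // prints: 7 7 7
  return 0;
}

The extra REPEAT_DEF_3_N hop is what lets a macro-valued P_NUM be expanded to its numeric value before token pasting.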
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl
new file mode 100644
index 000000000..8da8bfc8e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported
+ * data types: All
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension
+ * (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first
+ * source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] input_shape Input spatial shape
+ * @param[in] output_shape Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output),
+ int2 input_shape, int2 output_shape)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+
+ // Linearize index
+ int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
+
+ // Translate to output
+ int3 out_id;
+ out_id.x = linear_idx % output_shape.x;
+ out_id.y = (linear_idx / output_shape.x) % output_shape.y;
+ out_id.z = linear_idx / (output_shape.x * output_shape.y);
+
+ // Store result
+ *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) =
+ *((__global DATA_TYPE *)in.ptr);
+}
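Editor's note: per work-item, the kernel re-addresses exactly one element by linearizing the input (x, y, z) coordinate against the input shape and unravelling that index against the output shape. A small host-side C++ sketch of the same arithmetic (illustration only; names are hypothetical):

#include <array>
#include <cstdio>

std::array<int, 3> reshape_coord(std::array<int, 3> id, std::array<int, 2> in_shape,
                                 std::array<int, 2> out_shape)
{
  // Linearize the input coordinate (same formula as the kernel)
  const int linear_idx = id[0] + id[1] * in_shape[0] + id[2] * in_shape[0] * in_shape[1];

  // Translate the linear index back into output coordinates
  return {linear_idx % out_shape[0], (linear_idx / out_shape[0]) % out_shape[1],
          linear_idx / (out_shape[0] * out_shape[1])};
}

int main()
{
  // e.g. reshaping a 4x2xZ tensor to 2x4xZ: element (3, 1, 0) has linear index 7 -> (1, 3, 0)
  const auto out = reshape_coord({3, 1, 0}, {4, 2}, {2, 4});
  std::printf("%d %d %d\n", out[0], out[1], out[2]); // prints: 1 3 0
  return 0;
}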
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
deleted file mode 100644
index e9d4696b4..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// reference:
-// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
-// OpenCL kernel sources for the CLRadixSort class
-// the #include does not exist in OpenCL
-// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
-// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
-// if you find this software usefull you can cite the following work in your reports or articles:
-// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
-// http://hal.archives-ouvertes.fr/hal-00596730
-
-// Reference for floating point radix sort:
-// http://www.codercorner.com/RadixSortRevisited.htm
-
-// compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
- const int pass, __local int *loc_histo, const int n)
-{
- int it = get_local_id(0); // i local number of the processor
- int ig = get_global_id(0); // global number = i + g I
-
- int gr = get_group_id(0); // g group number
-
- int groups = get_num_groups(0);
- int items = get_local_size(0);
-
- // set the local histograms to zero
- for (int ir = 0; ir < _RADIX; ir++)
- {
- loc_histo[ir * items + it] = 0;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // range of keys that are analyzed by the work item
- int size = n / groups / items; // size of the sub-list
- int start = ig * size; // beginning of the sub-list
-
- unsigned int key;
- int shortkey, k;
-
- // compute the index
- // the computation depends on the transposition
- for (int j = 0; j < size; j++)
- {
-#ifdef TRANSPOSE
- k = groups * items * j + ig;
-#else
- k = j + start;
-#endif
-
- key = *((__global unsigned int *)(in_key_buf + k));
-
- // extract the group of _BITS bits of the pass
- // the result is in the range 0.._RADIX-1
- shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
- // increment the local histogram
- loc_histo[shortkey * items + it]++;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // copy the local histogram to the global one
- for (int ir = 0; ir < _RADIX; ir++)
- {
- d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
- }
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// initial transpose of the list for improving
-// coalescent memory access
-__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
- const int nbrow, const __global int *inperm, __global int *outperm,
- __local int *blockmat, __local int *blockperm, const int tilesize)
-{
-
- int i0 = get_global_id(0) * tilesize; // first row index
- int j = get_global_id(1); // column index
-
- int jloc = get_local_id(1); // local column index
-
- // fill the cache
- for (int iloc = 0; iloc < tilesize; iloc++)
- {
- int k = (i0 + iloc) * nbcol + j; // position in the matrix
- blockmat[iloc * tilesize + jloc] = invect[k];
-#ifdef PERMUT
- blockperm[iloc * tilesize + jloc] = inperm[k];
-#endif
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // first row index in the transpose
- int j0 = get_group_id(1) * tilesize;
-
- // put the cache at the good place
- for (int iloc = 0; iloc < tilesize; iloc++)
- {
- int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
- outvect[kt] = blockmat[jloc * tilesize + iloc];
-#ifdef PERMUT
- outperm[kt] = blockperm[jloc * tilesize + iloc];
-#endif
- }
-}
-
-// each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
- __global int *d_Histograms, const int pass,
- __global int *indices_in, __global int *indices_out,
- __local int *loc_histo, const int n)
-{
-
- int it = get_local_id(0);
- int ig = get_global_id(0);
-
- int gr = get_group_id(0);
- int groups = get_num_groups(0);
- int items = get_local_size(0);
-
- int start = ig * (n / groups / items);
- int size = n / groups / items;
-
- // take the histogram in the cache
- for (int ir = 0; ir < _RADIX; ir++)
- {
- loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int newpos, shortkey, k, newpost;
- unsigned int key;
-
- for (int j = 0; j < size; j++)
- {
-#ifdef TRANSPOSE
- k = groups * items * j + ig;
-#else
- k = j + start;
-#endif
- float org_value = in_key[k];
- key = *(__global unsigned int *)(in_key + k);
- shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
- newpos = loc_histo[shortkey * items + it];
-
-#ifdef TRANSPOSE
- int ignew, jnew;
- ignew = newpos / (n / groups / items);
- jnew = newpos % (n / groups / items);
- newpost = jnew * (groups * items) + ignew;
-#else
- newpost = newpos;
-#endif
-
- // d_outKeys[newpost]= key; // killing line !!!
- out_key[newpost] = org_value;
-
-#ifdef PERMUT
- indices_out[newpost] = indices_in[k];
-#endif
-
- newpos++;
- loc_histo[shortkey * items + it] = newpos;
- }
-}
-
-// perform a parallel prefix sum (a scan) on the local histograms
-// (see Blelloch 1990) each workitem worries about two memories
-// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
- __global int *globsum)
-{
- int it = get_local_id(0);
- int ig = get_global_id(0);
- int decale = 1;
- int n = get_local_size(0) * 2;
- int gr = get_group_id(0);
-
- // load input into local memory
- // up sweep phase
- temp[2 * it] = histo[2 * ig];
- temp[2 * it + 1] = histo[2 * ig + 1];
-
- // parallel prefix sum (algorithm of Blelloch 1990)
- for (int d = n >> 1; d > 0; d >>= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- if (it < d)
- {
- int ai = decale * (2 * it + 1) - 1;
- int bi = decale * (2 * it + 2) - 1;
- temp[bi] += temp[ai];
- }
- decale *= 2;
- }
-
- // store the last element in the global sum vector
- // (maybe used in the next step for constructing the global scan)
- // clear the last element
- if (it == 0)
- {
- globsum[gr] = temp[n - 1];
- temp[n - 1] = 0;
- }
-
- // down sweep phase
- for (int d = 1; d < n; d *= 2)
- {
- decale >>= 1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (it < d)
- {
- int ai = decale * (2 * it + 1) - 1;
- int bi = decale * (2 * it + 2) - 1;
-
- int t = temp[ai];
- temp[ai] = temp[bi];
- temp[bi] += t;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // write results to device memory
-
- histo[2 * ig] = temp[2 * it];
- histo[2 * ig + 1] = temp[2 * it + 1];
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// use the global sum for updating the local histograms
-// each work item updates two values
-__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
-{
- int ig = get_global_id(0);
- int gr = get_group_id(0);
-
- int s;
-
- s = globsum[gr];
-
- // write results to device memory
- histo[2 * ig] += s;
- histo[2 * ig + 1] += s;
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}
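Editor's note: the removed radixsort_scanhistograms kernel performed an exclusive Blelloch (1990) prefix sum over each histogram block. For reference, a serial C++ sketch of the same up-sweep/down-sweep (illustration only; it does not reproduce the kernel's per-work-item layout):

#include <cstdio>
#include <vector>

void blelloch_exclusive_scan(std::vector<int> &a) // size must be a power of two
{
  const int n = static_cast<int>(a.size());
  // Up-sweep (reduce) phase
  for (int d = 1; d < n; d *= 2)
    for (int i = 2 * d - 1; i < n; i += 2 * d)
      a[i] += a[i - d];
  a[n - 1] = 0; // clear the last element; its old value is the block total (globsum in the kernel)
  // Down-sweep phase
  for (int d = n / 2; d >= 1; d /= 2)
    for (int i = 2 * d - 1; i < n; i += 2 * d)
    {
      const int t = a[i - d];
      a[i - d] = a[i];
      a[i] += t;
    }
}

int main()
{
  std::vector<int> histo{3, 1, 7, 0, 4, 1, 6, 3};
  blelloch_exclusive_scan(histo);
  for (int v : histo)
    std::printf("%d ", v); // prints: 0 3 4 11 11 15 16 22
  return 0;
}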
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
new file mode 100644
index 000000000..987409739
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int vector_size = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output,
+ const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
+ op != ReductionOperation::ARG_IDX_MIN,
+ "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32,
+ DataType::S64);
+ }
+ if (prev_output != nullptr && prev_output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32,
+ DataType::S32, DataType::S64);
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output);
+ }
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *prev_output,
+ ITensorInfo *output, unsigned int axis,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_UNUSED(op);
+ // Output tensor auto initialization if not yet initialized
+ TensorShape output_shape{input->tensor_shape()};
+ output_shape.set(axis, 1);
+ DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32;
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ Window win =
+ calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size));
+ bool window_changed = false;
+
+ switch (axis)
+ {
+ case 0:
+ {
+ ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input;
+ AccessWindowStatic input_access(input_tensor_access, 0, 0,
+ static_cast<int>(input_tensor_access->dimension(0)), 1);
+ AccessWindowHorizontal output_access(output, 0, 1);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ break;
+ case 1:
+ case 2:
+ case 3:
+ {
+ AccessWindowHorizontal input_access(input, 0, vector_size);
+ AccessWindowHorizontal output_access(output, 0, vector_size);
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_tuple(err, win);
+}
+} // namespace
+
+CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx()
+ : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0),
+ _op(ReductionOperation::ARG_IDX_MAX)
+{
+}
+
+void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output,
+ ICLTensor *output, unsigned int axis,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr,
+ output->info(), axis, op));
+ auto win_config = validate_and_configure_window(
+ input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis,
+ op);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+ _input = input;
+ _prev_output = prev_output;
+ _output = output;
+ _reduction_axis = axis;
+ _op = op;
+
+ // Set build options
+ CLBuildOptions build_opts;
+
+ build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT");
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
+ build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
+ build_opts.add_option("-DDATA_TYPE_OUTPUT=" +
+ get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_SELECT=" +
+ get_cl_signed_type_from_element_size(input->info()->element_size()));
+
+ // Create kernel
+ cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
+ std::string kernel_axis_name;
+ switch (axis)
+ {
+ case 0:
+ {
+ const ICLTensor *input_for_width = prev_output != nullptr ? _prev_output : _input;
+ build_opts.add_option("-DWIDTH=" +
+ support::cpp11::to_string(input_for_width->info()->dimension(0)));
+
+ kernel_axis_name = "x";
+ lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0),
+ vector_size);
+ }
+ break;
+ case 1:
+ build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+ kernel_axis_name = "y";
+ break;
+ case 2:
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ kernel_axis_name = "z";
+ break;
+ case 3:
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+ build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
+ kernel_axis_name = "w";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+ "arg_min_max_ex_" + kernel_axis_name, build_opts.options()));
+
+ // Configure kernel window
+ ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
+}
+
+Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output,
+ const ITensorInfo *output, unsigned int axis,
+ ReductionOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr,
+ output->clone().get(), axis, op)));
+ return Status{};
+}
+
+void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ switch (_reduction_axis)
+ {
+ case 0:
+ {
+ // Set out window
+ Window out_window(window);
+ out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ // Get first input and output slices
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Reshape window
+ const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2;
+
+ // Set local sums buffer
+ unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size();
+ _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr);
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ if (_prev_output != nullptr)
+ {
+ add_2D_tensor_argument(idx, _prev_output, in_slice);
+ }
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ }
+ break;
+ case 1:
+ {
+ // Get first input and output slices
+ Window window_in{window};
+ window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1),
+ _input->info()->dimension(1)));
+ Window in_slice = window_in.first_slice_window_2D();
+ Window out_slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window_in.slide_window_slice_2D(in_slice) &&
+ window.slide_window_slice_2D(out_slice));
+ }
+ break;
+ case 2:
+ {
+ // Get first input and output slices
+ Window window_in{window};
+ window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2),
+ _input->info()->dimension(2)));
+ Window in_slice = window_in.first_slice_window_3D();
+ Window out_slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window_in.slide_window_slice_3D(in_slice) &&
+ window.slide_window_slice_3D(out_slice));
+ }
+ break;
+ case 3:
+ {
+ // Get first input and output slices
+ Window window_in{window};
+ window_in.set(3, Window::Dimension(0, 1, 1));
+ Window in_slice = window_in.first_slice_window_4D();
+ Window out_slice = window.first_slice_window_4D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, in_slice);
+ add_4D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, lws_hint());
+ } while (window_in.slide_window_slice_4D(in_slice) &&
+ window.slide_window_slice_4D(out_slice));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+} // namespace arm_compute
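Editor's note: a hedged usage sketch, not part of the patch. In practice the CLArgMinMaxLayerEx runtime function drives this kernel; the snippet below validates and configures a single-pass ARG_IDX_MAX over axis 0 of a hypothetical 2D F32 tensor and enqueues it directly.

#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void argminmax_ex_sketch()
{
  CLScheduler::get().default_init();

  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 16U), 1, DataType::S32));

  // Static validation first; prev_output is nullptr for a single-pass reduction
  ARM_COMPUTE_ERROR_THROW_ON(CLArgMinMaxLayerKernelEx::validate(
    input.info(), nullptr, output.info(), 0, ReductionOperation::ARG_IDX_MAX));

  CLArgMinMaxLayerKernelEx kernel;
  kernel.configure(&input, nullptr, &output, 0, ReductionOperation::ARG_IDX_MAX);

  input.allocator()->allocate();
  output.allocator()->allocate();
  CLScheduler::get().enqueue(kernel); // launch on the default CL queue
}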
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index fbc76f5e1..a5daa2410 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -43,6 +43,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/AccessWindowStatic.h"
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -55,7 +57,7 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
const ITensorInfo *output)
{
const TensorShape &out_shape =
- TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+ TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8);
@@ -68,15 +70,15 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8,
DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
+ detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
+ "Wrong shape for output");
}
return Status{};
}
} // namespace
CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -111,13 +113,13 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor
build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code)));
build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
_kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+ ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
const ValidRegion &valid_region = broadcast_pair.second;
@@ -130,8 +132,8 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
output_access.set_valid_region(win, valid_region);
@@ -151,7 +153,7 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
{
can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+ (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
@@ -160,13 +162,13 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
bool has_collapsed = false;
Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
+ : window;
const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+ has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+ has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
Window slice = collapsed.first_slice_window_3D();
Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
@@ -189,9 +191,9 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue)
BorderSize CLBinaryLogicalOpKernel::border_size() const
{
const unsigned int replicateSize =
- _output->info()->dimension(0) -
- std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ _output->info()->dimension(0) -
+ std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
const unsigned int border =
- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
return BorderSize(0, border, 0, 0);
}
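Editor's note, a worked example of the border computed above with hypothetical shapes: if output dimension(0) = 10, input1 dimension(0) = 10 and broadcast input2 dimension(0) = 1, then replicateSize = 10 - min(10, 1) = 9 and border = min(num_elems_processed_per_iteration - 1, 9) = 9 whenever the vector size is at least 10, giving BorderSize(0, 9, 0, 0), i.e. padding only on the right edge of the broadcast dimension.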
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
new file mode 100644
index 000000000..dc06bfbb3
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+#include "support/StringSupport.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+ DataType::S16, DataType::U16, DataType::U32,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(),
+ "Input and output data types must be different");
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype
+ // must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ // Get number of elements to process per iterations
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DVEC_SIZE=" +
+ support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" +
+ get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ const std::string kernel_name = "cast_bool";
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+
+ // Configure kernel
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ // Collapse window
+ const Window &full_window = window();
+ Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+ ICLKernel::configure_internal(collapsed_window);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = kernel_name;
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(output->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLCastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+ return Status{};
+}
+} // namespace arm_compute
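Editor's note: a hedged usage sketch, not part of the patch. The CLCastBool runtime function is the usual entry point; here the kernel is driven directly to cast a hypothetical U8 boolean tensor to F32.

#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void cast_bool_sketch()
{
  CLScheduler::get().default_init();

  CLTensor boolean_in, float_out;
  boolean_in.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::U8));
  float_out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

  ARM_COMPUTE_ERROR_THROW_ON(CLCastBoolKernel::validate(boolean_in.info(), float_out.info()));

  CLCastBoolKernel kernel;
  kernel.configure(&boolean_in, &float_out);

  boolean_in.allocator()->allocate();
  float_out.allocator()->allocate();
  CLScheduler::get().enqueue(kernel); // nonzero input bytes become 1.0f, zero bytes 0.0f
}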
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index 67aaf2db6..4206f1fd4 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -61,14 +64,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
input_access.set_valid_region(win, output->valid_region());
Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
return std::make_pair(err, win);
}
} // namespace
CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
- : _input(nullptr), _output(nullptr), _lookups(nullptr)
+ : _input(nullptr), _output(nullptr), _lookups(nullptr)
{
}
@@ -77,8 +80,8 @@ Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensor
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -108,8 +111,8 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu
build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
// Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 3bfe3e407..62da2376e 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -45,6 +45,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/core/UtilsEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -62,15 +66,15 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
- input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -86,7 +90,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
std::unique_ptr<ITensorInfo> output_info = input->clone();
output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex(
- input->tensor_shape(), indices->tensor_shape(), actual_axis));
+ input->tensor_shape(), indices->tensor_shape(), actual_axis));
// Output auto initialization if not yet initialized
auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
@@ -100,7 +104,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
CLGatherExKernel::CLGatherExKernel()
- : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+ : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
{
}
@@ -109,11 +113,11 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input->info(), indices->info(), output->info(), axis));
+ validate_arguments(input->info(), indices->info(), output->info(), axis));
// Configure kernel window
auto win_config =
- validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
+ validate_and_configure_window(input->info(), indices->info(), output->info(), axis);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
_input = input;
@@ -133,7 +137,7 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice
// Create kernel
_kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
+ CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options()));
ICLKernel::configure_internal(win_config.second);
}
@@ -144,7 +148,7 @@ Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *i
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
indices->clone().get(),
output->clone().get(), axis)
- .first);
+ .first);
return Status{};
}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index 930e7c944..03ca6ddcb 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -61,8 +62,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
input_access.set_valid_region(win, output->valid_region());
Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -78,8 +79,8 @@ Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
@@ -102,7 +103,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
{
ARM_COMPUTE_ERROR_THROW_ON(
- validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
_lookups = lookups;
_keys = keys;
@@ -111,9 +112,9 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
_hits = hits;
// Make _lookup_indices tensor
- _lookup_indices = support::cpp14::make_unique<CLTensor>();
+ _lookup_indices = std::make_unique<CLTensor>();
_lookup_indices->allocator()->init(
- TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+ TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
_lookup_indices->allocator()->allocate();
// Set kernel build options
@@ -127,8 +128,8 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
// Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), output->info());
@@ -148,7 +149,7 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue)
// Set values of hits
const int32_t *lookups_buf =
- reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
+ reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer());
const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer());
uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer());
int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer());
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index 61c14d271..945af3c51 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -42,12 +42,16 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
@@ -94,8 +98,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
} // namespace
CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx()
- : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
- _run_in_place(false)
+ : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
+ _run_in_place(false)
{
}
@@ -132,7 +136,7 @@ void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor
// Create kernel
_kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
+ CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info());
@@ -147,7 +151,7 @@ Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
- input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp
new file mode 100644
index 000000000..a00fc5e2e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+CLMemsetKernel::CLMemsetKernel() : ICLKernel(), _tensor(nullptr), _full_window() {}
+
+void CLMemsetKernel::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window);
+}
+
+void CLMemsetKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor,
+ const PixelValue &constant_value, Window *window)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window));
+
+ _tensor = tensor;
+
+ const DataType data_type = tensor->info()->data_type();
+ const int vec_size_x = 16 / tensor->info()->element_size();
+
+ // Create and update the window (if needed)
+ _full_window = calculate_max_window(*tensor->info());
+ Window win = _full_window;
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+ win = *window;
+ }
+
+ const int output_width_x = win.num_iterations(0);
+ const bool multi_access_x = output_width_x >= vec_size_x;
+ const bool remainder_x = output_width_x % vec_size_x > 0;
+
+ if (multi_access_x)
+ {
+ win.set(
+ Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("memset", build_opts.options()));
+}
+
+Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value,
+ Window *window)
+{
+ ARM_COMPUTE_UNUSED(tensor);
+ ARM_COMPUTE_UNUSED(constant_value);
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+ }
+ return Status{};
+}
+
+void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Collapse all the batches on the third dimension
+ Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _tensor, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
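For reference, a minimal usage sketch of the new kernel (illustrative only, not part of the patch; it assumes the CL runtime has already been initialised via CLScheduler::get().default_init() and that the tensor is allocated):

#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

// Fill an already-allocated CL tensor with zeros over its full window.
void fill_with_zeros(arm_compute::CLTensor &tensor)
{
  arm_compute::CLMemsetKernel memset_kernel;
  // A default-constructed PixelValue represents zero; passing nullptr as the
  // window makes the kernel operate on its full window.
  memset_kernel.configure(&tensor, arm_compute::PixelValue(), nullptr);
  arm_compute::CLScheduler::get().enqueue(memset_kernel);
}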
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
index 6b27c9917..da7437e97 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -40,15 +40,19 @@
#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -99,7 +103,7 @@ std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *inpu
} // namespace
CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel()
- : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
{
}
@@ -108,7 +112,7 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input->info(), scale_factor->info(), output->info()));
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
_input = input;
_scale_factor = scale_factor;
@@ -123,9 +127,9 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen
Window win = calculate_max_window(*output->info());
if (multi_access_x)
{
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
- vec_size_x));
+ win.set(
+ Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -134,11 +138,11 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option_if(
- multi_access_x, "-DLAST_ACCESSED_X=" +
- support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ multi_access_x, "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
_kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options()));
+ CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options()));
}
Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
@@ -147,7 +151,7 @@ Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
ARM_COMPUTE_RETURN_ON_ERROR(
- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index 643c8b110..cd5e571e9 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -80,9 +83,9 @@ void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
std::set<std::string> build_opts;
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+ ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
_kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
// Configure window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
new file mode 100644
index 000000000..4c4cbe710
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+#include "support/StringSupport.h"
+#include <string>
+namespace arm_compute
+{
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *output, int depth, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, on_value, output);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= output->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32);
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+ indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+ }
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
+ const ITensorInfo *on_value,
+ ITensorInfo *output, int depth, int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output, indices);
+ const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+ // Output auto initialization if not yet initialized
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+ indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+ auto_init_if_empty((*output), output_shape, 1, on_value->data_type());
+ // Create window
+ Window win = calculate_max_window(*output, Steps());
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+CLOneHotKernel::CLOneHotKernel()
+ : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
+ _is_off_value_memset(false)
+{
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+ const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+ _is_off_value_memset = false;
+ ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, off_value, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(off_value->info());
+ ARM_COMPUTE_ERROR_ON(off_value->info()->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+ _off_value = off_value;
+ configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+ ICLTensor *output, int depth, int axis)
+{
+ _is_off_value_memset = true;
+ ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+ configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor *on_value,
+ ICLTensor *output, int depth, int axis)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ if (_is_off_value_memset)
+ {
+ // Replace the window with one computed from the indices info
+ win_config.second = calculate_max_window(*indices->info(), Steps());
+ }
+ _indices = indices;
+ _on_value = on_value;
+ _output = output;
+ const auto actual_axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(
+ data_size_from_type(on_value->info()->data_type())));
+ build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis));
+ build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
+ build_opts.add_option("-DOUTPUT_DIM_Z=" +
+ support::cpp11::to_string(output->info()->dimension(2)));
+ // Create kernel
+ const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot";
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+ ICLKernel::configure_internal(win_config.second);
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(off_value);
+ ARM_COMPUTE_RETURN_ERROR_ON(off_value->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+ on_value->clone().get(),
+ output->clone().get(), depth, axis)
+ .first);
+ return Status{};
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *output, int depth, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+ on_value->clone().get(),
+ output->clone().get(), depth, axis)
+ .first);
+ return Status{};
+}
+void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _indices, window_collapsed);
+ add_1D_tensor_argument(idx, _on_value, window_collapsed);
+ if (!_is_off_value_memset)
+ {
+ add_1D_tensor_argument(idx, _off_value, window_collapsed);
+ }
+ add_4D_tensor_argument(idx, _output, window_collapsed);
+ enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace arm_compute
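A minimal usage sketch for the new one-hot kernel (illustrative only, not part of the patch; tensor names, depth and axis are assumptions, and all tensors are expected to be allocated beforehand):

#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

// indices: S32 tensor of class ids; on_value/off_value: single-element tensors
// of the output data type; output: result tensor (auto-initialised if empty).
void run_one_hot(arm_compute::CLTensor &indices, arm_compute::CLTensor &on_value,
                 arm_compute::CLTensor &off_value, arm_compute::CLTensor &output)
{
  arm_compute::CLOneHotKernel one_hot;
  one_hot.configure(&indices, &on_value, &off_value, &output, /*depth=*/10, /*axis=*/0);
  arm_compute::CLScheduler::get().enqueue(one_hot);
}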
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp
new file mode 100644
index 000000000..b6efeac35
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(constant_value);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+ for (size_t i = 0; i < padding.size(); ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
+ }
+ }
+
+ if (output->total_size() > 0)
+ {
+ TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ ARM_COMPUTE_UNUSED(constant_value, mode);
+
+ const TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
+
+ num_elems_processed_per_iteration =
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type())));
+ if (input->dimension(0) < num_elems_processed_per_iteration)
+ {
+ num_elems_processed_per_iteration =
+ 1 << static_cast<unsigned int>(std::log2(input->dimension(0)));
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ const int input_start_x =
+ mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0;
+ const int input_start_y =
+ (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+
+ AccessWindowRectangle input_access(input, input_start_x, input_start_y,
+ num_elems_processed_per_iteration, 1);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ const bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLPadLayerKernelEx::CLPadLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false)
+{
+}
+
+void CLPadLayerKernelEx::configure(const ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value,
+ mode);
+}
+
+void CLPadLayerKernelEx::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+ ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), output->info(), padding, constant_value, mode));
+
+ _input = input;
+ _output = output;
+ _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
+
+ // Configure window
+ unsigned int vec_size;
+ auto win_config = validate_and_configure_window(input->info(), output->info(), padding,
+ constant_value, mode, vec_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set build options
+ std::string kernel_name = "pad_layer_";
+
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int pad_right_start = input_width + pad_x_before;
+
+ _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0;
+ _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+ if (padding.size() > 1)
+ {
+ build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+
+ if (padding.size() > 2)
+ {
+ build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
+ }
+ }
+
+ switch (mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ kernel_name += "constant";
+
+ build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(pad_x_before >= vec_size,
+ "-DNUM_THREADS_TO_SKIP_X=" +
+ support::cpp11::to_string(pad_x_before / vec_size));
+
+ if (_4d_enabled)
+ {
+ build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
+ build_opts.add_option("-DSRC_BATCH=" +
+ support::cpp11::to_string(input->info()->dimension(3)));
+ }
+
+ break;
+ }
+ case PaddingMode::SYMMETRIC:
+ case PaddingMode::REFLECT:
+ {
+ kernel_name += "symmetric_reflect";
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
+ const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
+ const unsigned int output_last_x =
+ ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+
+ build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" +
+ support::cpp11::to_string(pad_x_before_remainder));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" +
+ support::cpp11::to_string(pad_x_after_remainder));
+ build_opts.add_option(
+ "-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option(
+ "-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" +
+ support::cpp11::to_string(after_pad_fact_x % vec_size));
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+}
+
+Status CLPadLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ unsigned int vec_size;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ output->clone().get(), padding,
+ constant_value, mode, vec_size)
+ .first);
+
+ return Status{};
+}
+
+void CLPadLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window win_in = window;
+ win_in.adjust(Window::DimX, _input_start_x, true);
+ win_in.adjust(Window::DimY, _input_start_y, true);
+
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_in = win_in.first_slice_window_3D();
+ unsigned int batch = 0;
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ if (_4d_enabled)
+ {
+ add_argument<unsigned int>(idx, batch++);
+ }
+
+ enqueue(queue, *this, slice_out, lws_hint());
+ } while (window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
+} // namespace arm_compute
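A usage sketch of the new pad kernel (illustrative only, not part of the patch; the padding amounts are assumptions and both tensors are expected to be allocated beforehand):

#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

// Pad the X dimension by 1 element on each side and Y by 2, filling with zeros.
void pad_constant(arm_compute::CLTensor &input, arm_compute::CLTensor &output)
{
  arm_compute::PaddingList padding = {{1, 1}, {2, 2}};
  arm_compute::CLPadLayerKernelEx pad_kernel;
  pad_kernel.configure(&input, &output, padding, arm_compute::PixelValue(),
                       arm_compute::PaddingMode::CONSTANT);
  arm_compute::CLScheduler::get().enqueue(pad_kernel);
}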
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
index 1a7a18cfa..9aa815f55 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -40,15 +40,19 @@
#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
namespace arm_compute
@@ -87,9 +91,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
if (multi_access_x)
{
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
- vec_size_x));
+ win.set(
+ Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
}
Coordinates coord;
@@ -101,7 +105,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel()
- : _input(nullptr), _scale_factor(nullptr), _output(nullptr)
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr)
{
}
@@ -110,7 +114,7 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input->info(), scale_factor->info(), output->info()));
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
_input = input;
_scale_factor = scale_factor;
@@ -132,11 +136,11 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT
build_opts.add_option("-DDATA_TYPE_OUT=" +
get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option_if(
- multi_access_x, "-DLAST_ACCESSED_X=" +
- support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+ multi_access_x,
+ "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
_kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options()));
+ CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options()));
}
Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
@@ -145,7 +149,7 @@ Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
ARM_COMPUTE_RETURN_ON_ERROR(
- validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ validate_and_configure_window(input->clone().get(), output->clone().get()).first);
return Status{};
}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index 06c2579f2..70374ba61 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -63,7 +66,7 @@ const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
- ReduceOperation op)
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -74,7 +77,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
DataType::F32, DataType::S32);
- if (op == ReduceOperation::SUM)
+ if (op == ReductionOperation::SUM)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
"Not support QASYMM8, yet");
@@ -98,7 +101,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
- const uint32_t axis, ReduceOperation op)
+ const uint32_t axis, ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -114,22 +117,22 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu
// Construct kernel name
std::string kernel_name;
int op_code = 0;
- if (op == ReduceOperation::MAX)
+ if (op == ReductionOperation::MAX)
{
kernel_name = "reduce_min_max";
op_code = 1;
}
- else if (op == ReduceOperation::MIN)
+ else if (op == ReductionOperation::MIN)
{
kernel_name = "reduce_min_max";
op_code = 2;
}
- else if (op == ReduceOperation::SUM)
+ else if (op == ReductionOperation::SUM)
{
kernel_name = "reduce_sum_mean";
op_code = 3;
}
- else if (op == ReduceOperation::MEAN)
+ else if (op == ReductionOperation::MEAN_SUM)
{
kernel_name = "reduce_sum_mean";
op_code = 4;
@@ -145,7 +148,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu
// Create kernel
_kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
// Configure kernel window
Window win = calculate_max_window(*output_info, Steps());
@@ -158,7 +161,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu
}
Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const uint32_t axis, ReduceOperation op)
+ const uint32_t axis, ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
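The switch from ReduceOperation to the stock ReductionOperation enum keeps the operation-to-kernel mapping intact; a hypothetical helper (not part of the patch) summarising the mapping used in CLReduceOperationKernel::configure():

#include <stdexcept>
#include <string>
#include "arm_compute/core/Types.h"

// Mirrors the op -> kernel-name (and op_code) mapping in the kernel's configure().
std::string reduce_kernel_name(arm_compute::ReductionOperation op)
{
  using arm_compute::ReductionOperation;
  switch (op)
  {
    case ReductionOperation::MAX:      // op_code 1
    case ReductionOperation::MIN:      // op_code 2
      return "reduce_min_max";
    case ReductionOperation::SUM:      // op_code 3
    case ReductionOperation::MEAN_SUM: // op_code 4
      return "reduce_sum_mean";
    default:
      throw std::runtime_error("Operation not supported, yet");
  }
}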
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
index 8d8853c81..c9d6dc31c 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -40,7 +40,7 @@
#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
+#include "src/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -48,6 +48,10 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include <climits>
@@ -94,8 +98,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
return std::make_tuple(err, win);
}
} // namespace
@@ -115,7 +119,7 @@ void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *outp
// Create kernel
_kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts));
+ CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts));
auto win_config = validate_and_configure_window(input->info(), output->info());
@@ -128,7 +132,7 @@ Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITenso
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
ARM_COMPUTE_RETURN_ON_ERROR(
- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp
deleted file mode 100644
index 480532388..000000000
--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-namespace arm_compute
-{
-CPPOneHotKernelEx::CPPOneHotKernelEx()
- : _indices(nullptr), _depth(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
- _axis(-1)
-{
-}
-
-void CPPOneHotKernelEx::configure(const ITensor *indices, const ITensor *depth,
- const ITensor *on_value, const ITensor *off_value,
- ITensor *output, const int axis)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(indices, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate(indices, depth, on_value, off_value, axis));
-
- _indices = indices;
- _depth = depth;
- _on_value = on_value;
- _off_value = off_value;
- _output = output;
- _axis = axis;
-
- ICPPKernel::configure(Window()); // Default 1 iteration window
-}
-
-Status CPPOneHotKernelEx::validate(const ITensor *indices, const ITensor *depth,
- const ITensor *on_value, const ITensor *off_value,
- const int axis)
-{
- ARM_COMPUTE_UNUSED(on_value, off_value);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(indices, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(depth, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->info()->num_dimensions() != 1,
- "Only 1D indices are supported.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != -1, "Only axis = -1 is supported.");
- return Status{};
-}
-
-bool CPPOneHotKernelEx::is_parallelisable() const { return false; }
-
-void CPPOneHotKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
-
- const auto num_indices = _indices->info()->dimension(0);
- const auto depth = *reinterpret_cast<int32_t *>(_depth->ptr_to_element(Coordinates{0}));
- const auto dtype = _output->info()->data_type();
- switch (dtype)
- {
- case DataType::F32:
- {
- const auto on_value = *reinterpret_cast<float *>(_on_value->ptr_to_element(Coordinates{0}));
- const auto off_value = *reinterpret_cast<float *>(_off_value->ptr_to_element(Coordinates{0}));
- for (size_t i = 0; i < num_indices; ++i)
- {
- const auto index = *reinterpret_cast<int32_t *>(_indices->ptr_to_element(Coordinates{i}));
- for (int d = 0; d < depth; ++d)
- *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(d, i))) =
- (d == index) ? on_value : off_value;
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
deleted file mode 100644
index 254c33ea9..000000000
--- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
-
-#include <algorithm>
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-
-namespace
-{
-void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out)
-{
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
- vst1q_u8(output_ptr, vcombine_u8(pa, pb));
-}
-
-using namespace arm_compute;
-template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op_templ(
- const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
- int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &,
- OutputScalarType *, const bool),
- int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *,
- OutputScalarType *))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
-
- if (is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win,
- [&](const Coordinates &) {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr =
- reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value =
- *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
- non_broadcast_input_ptr, broadcast_value,
- output_ptr, !is_broadcast_input_2);
- for (; x < window_end_x; ++x)
- {
- const auto a = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) =
- (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
- !is_broadcast_input_2 ? a : broadcast_value);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win,
- [&](const Coordinates &) {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr =
- reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr =
- reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
- input1_ptr, input2_ptr, output_ptr);
- for (; x < window_end_x; ++x)
- {
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = (*scalar_func)(a, b);
- }
- },
- input1, input2, output);
- }
-}
-
-} // namespace
-
-namespace arm_compute
-{
-
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset,
- const float32x4_t &scale)
-{
- qasymm8x16_t x = vld1q_u8(input1_ptr);
- const float32x4x4_t out = {{
- vmulq_f32(
- vcvtq_f32_s32(vsubq_s32(
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
- scale),
- vmulq_f32(
- vcvtq_f32_s32(vsubq_s32(
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
- scale),
- vmulq_f32(
- vcvtq_f32_s32(vsubq_s32(
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
- scale),
- vmulq_f32(
- vcvtq_f32_s32(vsubq_s32(
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
- scale),
- }};
- return out;
-}
-
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset,
- const float32x4_t &invscale)
-{
- int32x4x4_t out = {{
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }};
- store_quantized_int32(output_ptr, out);
-}
-
-float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale)
-{
- const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
-
- const float32x4x4_t broadcast_vector = {{
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(
- vmovl_u8(vget_low_u8(broadcast_value_vec))))),
- voffset)),
- vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(
- vmovl_u8(vget_low_u8(broadcast_value_vec))))),
- voffset)),
- vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(
- vmovl_u8(vget_high_u8(broadcast_value_vec))))),
- voffset)),
- vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(
- vmovl_u8(vget_high_u8(broadcast_value_vec))))),
- voffset)),
- vscale),
- }};
- return broadcast_vector;
-}
-
-void elementwise_op_quantized(
- const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t,
- float32x4_t, float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t,
- int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
-
- UniformQuantizationInfo qinfo = out->info()->quantization_info().uniform();
- const float output_scale = qinfo.scale;
- const int output_offset = qinfo.offset;
-
- // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from
- // zero)
- const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
-
- if (is_broadcast_across_x)
- {
- // Select the broadcast input on the X axis
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const UniformQuantizationInfo broadcast_qinfo =
- broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo =
- non_broadcast_tensor->info()->quantization_info().uniform();
-
- const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
- const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(
- win,
- [&](const Coordinates &) {
- const auto non_broadcast_input_ptr =
- reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector =
- dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale);
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
- non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto,
- invvscaleo, !is_broadcast_input_2);
- for (; x < window_end_x; ++x)
- {
- const float afs =
- dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) =
- (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs,
- out->info()->quantization_info());
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Input1 quantization info
- UniformQuantizationInfo qinfo = in1->info()->quantization_info().uniform();
- const int32x4_t voffset1 = vdupq_n_s32(qinfo.offset);
- const float32x4_t vscale1 = vdupq_n_f32(qinfo.scale);
-
- // Input2 quantization info
- qinfo = in2->info()->quantization_info().uniform();
- const int32x4_t voffset2 = vdupq_n_s32(qinfo.offset);
- const float32x4_t vscale2 = vdupq_n_f32(qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
- const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win,
- [&](const Coordinates &) {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
- input1_ptr, input2_ptr, output_ptr, voffset1,
- voffset2, vscale1, vscale2, voffseto, invvscaleo);
- for (; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) =
- (*scalar_func)(afs, bfs, out->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- float (*scalar_func)(const float &, const float &),
- int (*broadcast_func)(int, int, int, const float *, const float &, float *,
- const bool),
- int (*neon_func)(int, int, int, const float *, const float *, float *))
-{
- elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func,
- broadcast_func, neon_func);
-}
-
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
- int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
- uint8_t *, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *))
-{
- elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func,
- broadcast_func, neon_func);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp
deleted file mode 100644
index 648705ba9..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/NESymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <array>
-#include <cmath>
-#include <map>
-#include <set>
-
-using namespace arm_compute;
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const ActivationLayerInfo &activation_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
-
- static std::set<ActivationLayerInfo::ActivationFunction> qasymm8_supported_activations = {
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH};
- static std::set<ActivationLayerInfo::ActivationFunction> qsymm16_supported_activations = {
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH};
- const DataType data_type = input->data_type();
- const QuantizationInfo &oq_info =
- (output != nullptr) ? output->quantization_info() : input->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- is_data_type_quantized_asymmetric(data_type) &&
- (qasymm8_supported_activations.count(f_act) == 0),
- "For QASYMM8 only tanh, logistic, relu and lower/upper bounded relu are supported");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) &&
- (qsymm16_supported_activations.count(f_act) == 0),
- "For QSYMM16 only tanh and logistic are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) &&
- (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
- (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) &&
- (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
- (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
- (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
- (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
- (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
- (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-
- // Checks performed when output is configured
- if ((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- if (output != nullptr)
- {
- // Output auto-initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
-
- // NEActivationLayerKernelEx doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
- }
-
- return std::make_pair(Status{}, win);
-}
-
-inline uint32x4_t vreinterpret_unsigend_int(const float32x4_t &vec)
-{
- return vreinterpretq_u32_f32(vec);
-}
-
-inline float32x4_t vreinterpret_floating_point(const uint32x4_t &vec)
-{
- return vreinterpretq_f32_u32(vec);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline uint16x8_t vreinterpret_unsigend_int(const float16x8_t &vec)
-{
- return vreinterpretq_u16_f16(vec);
-}
-inline float16x8_t vreinterpret_floating_point(const uint16x8_t &vec)
-{
- return vreinterpretq_f16_u16(vec);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-} // namespace
-
-NEActivationLayerKernelEx::NEActivationLayerKernelEx()
- : _input(nullptr), _output(nullptr), _func(nullptr), _act_info()
-{
-}
-
-void NEActivationLayerKernelEx::configure(ITensor *input, ITensor *output,
- ActivationLayerInfo activation_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
- _input = input;
- _act_info = activation_info;
- _output = input;
-
- // Out-of-place calculation
- if (output != nullptr)
- {
- _output = output;
- }
-
- // Disabled activation, thus no operation needed
- if (!activation_info.enabled())
- {
- _func = nullptr;
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
- input->info(), (output != nullptr) ? output->info() : nullptr, activation_info));
-
- // Activation functions : FP32
- static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 = {
- {ActivationFunction::ABS,
- &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float>},
- {ActivationFunction::LINEAR,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float>},
- {ActivationFunction::LOGISTIC,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float>},
- {ActivationFunction::RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float>},
- {ActivationFunction::BOUNDED_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float>},
- {ActivationFunction::LU_BOUNDED_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float>},
- {ActivationFunction::LEAKY_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float>},
- {ActivationFunction::SOFT_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float>},
- {ActivationFunction::ELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float>},
- {ActivationFunction::SQRT,
- &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float>},
- {ActivationFunction::SQUARE,
- &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float>},
- {ActivationFunction::TANH,
- &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float>},
- {ActivationFunction::IDENTITY,
- &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float>},
- };
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // Activation functions : FP16
- static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f16 = {
- {ActivationFunction::ABS,
- &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float16_t>},
- {ActivationFunction::LINEAR,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float16_t>},
- {ActivationFunction::LOGISTIC,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float16_t>},
- {ActivationFunction::RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float16_t>},
- {ActivationFunction::BOUNDED_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float16_t>},
- {ActivationFunction::LU_BOUNDED_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float16_t>},
- {ActivationFunction::LEAKY_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float16_t>},
- {ActivationFunction::SOFT_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float16_t>},
- {ActivationFunction::ELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float16_t>},
- {ActivationFunction::SQRT,
- &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float16_t>},
- {ActivationFunction::SQUARE,
- &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float16_t>},
- {ActivationFunction::TANH,
- &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float16_t>},
- {ActivationFunction::IDENTITY,
- &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float16_t>},
- };
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
-
- // Activation functions : QASYMM8
- static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 = {
- {ActivationFunction::LOGISTIC,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qasymm8_t>},
- {ActivationFunction::BOUNDED_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, qasymm8_t>},
- {ActivationFunction::LU_BOUNDED_RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t>},
- {ActivationFunction::RELU,
- &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, qasymm8_t>},
- {ActivationFunction::TANH,
- &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qasymm8_t>},
- {ActivationFunction::IDENTITY,
- &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, qasymm8_t>},
- };
-
- // Activation functions : QSYMM16
- static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qsymm16 = {
- {ActivationFunction::LOGISTIC,
- &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qsymm16_t>},
- {ActivationFunction::TANH,
- &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qsymm16_t>},
- };
-
- switch (input->info()->data_type())
- {
- case DataType::QASYMM8:
- _func = act_map_qasymm8[activation_info.activation()];
- break;
- case DataType::QSYMM16:
- _func = act_map_qsymm16[activation_info.activation()];
- break;
- case DataType::F32:
- _func = act_map_f32[activation_info.activation()];
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = act_map_f16[activation_info.activation()];
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-
- // Configure kernel window
- auto win_config =
- validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICPPKernel::configure(win_config.second);
-}
-
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-NEActivationLayerKernelEx::activation(const Window &window)
-{
- /** NEON vector tag type. */
- using ExactTagType =
- typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationFunction act = F;
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, win_collapsed);
- Iterator output(_output, win_collapsed);
-
- const auto infinity = wrapper::vdup_n(std::numeric_limits<T>::infinity(), ExactTagType{});
- const auto epsilon = wrapper::vdup_n(static_cast<T>(1e-24), ExactTagType{});
- const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
- const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- const auto va = wrapper::vdup_n(static_cast<T>(_act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<T>(_act_info.b()), ExactTagType{});
- const auto a = static_cast<T>(_act_info.a());
- const auto b = static_cast<T>(_act_info.b());
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
-
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch (act)
- {
- case ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationFunction::LOGISTIC:
- // exp(-vin)
- tmp = wrapper::vexpq(wrapper::vneg(vin));
-
- // NaN -> INF
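- // (vceq(tmp, tmp) is false only for NaN lanes; those lanes are forced to +inf so that the
- // 1 / (1 + tmp) step below saturates to 0, the logistic limit for large negative inputs)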
- tmp = vreinterpret_floating_point(wrapper::vorr(
- wrapper::vand(wrapper::vnot(wrapper::vceq(tmp, tmp)),
- vreinterpret_unsigend_int(infinity)),
- wrapper::vand(wrapper::vceq(tmp, tmp), vreinterpret_unsigend_int(tmp))));
-
- // 1 / 1 + tmp
- tmp = wrapper::vinv(wrapper::vadd(const_1, tmp));
- break;
- case ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationFunction::SOFT_RELU:
- tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
- break;
- case ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
- wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationFunction::SQRT:
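- // sqrt(x) computed as 1 / (1 / sqrt(x)); epsilon keeps the intermediate reciprocal finite
- // when the input is exactly zero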
- tmp = wrapper::vinv(wrapper::vinvsqrt(vin + epsilon));
- break;
- case ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- const T in = *(reinterpret_cast<const T *>(input_ptr + x));
- T tmp;
- switch (act)
- {
- case ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationFunction::LOGISTIC:
- tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
- break;
- case ActivationFunction::RELU:
- tmp = std::max<T>(static_cast<T>(0), in);
- break;
- case ActivationFunction::BOUNDED_RELU:
- tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
- break;
- case ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<T>(a, std::max<T>(b, in));
- break;
- case ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationFunction::SOFT_RELU:
- tmp = std::log(static_cast<T>(1) + std::exp(in));
- break;
- case ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationFunction::IDENTITY:
- tmp = in;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type
-NEActivationLayerKernelEx::activation(const Window &window)
-{
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationFunction act = F;
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, win_collapsed);
- Iterator output(_output, win_collapsed);
-
- const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform();
- const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(_act_info.a(), qi_in));
- const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(_act_info.b(), qi_in));
- const qasymm8_t a = quantize_qasymm8(_act_info.a(), qi_in);
- const qasymm8_t b = quantize_qasymm8(_act_info.b(), qi_in);
- const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
- const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
- const auto vconst_1 = vdupq_n_f32(1.f);
- const float32x4_t va_f32 = vdupq_n_f32(_act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b());
- const float a_f32 = _act_info.a();
- const float b_f32 = _act_info.b();
-
- // Initialise scale/offset for re-quantization
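- // (dequantizing with the input parameters and re-quantizing with the output ones gives
- //  q_out = (q_in - offset_in) * scale_in / scale_out + offset_out = q_in * s + o)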
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- float32x4_t vs = vdupq_n_f32(s);
- float32x4_t vo = vdupq_n_f32(o);
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if (act == ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_u8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if (act == ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if (act == ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if (act == ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep = {{
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
- vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
- vin_deq.val[1])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
- vin_deq.val[2])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
- vin_deq.val[3])))),
- }};
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if (act == ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep = {{
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }};
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- T in = *(reinterpret_cast<const T *>(input_ptr + x));
- T tmp;
- if (act == ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
- }
- else if (act == ActivationFunction::BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(const_0, in));
- tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
- }
- else if (act == ActivationFunction::LU_BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(b, in));
- tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
- }
- else if (act == ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if (act == ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type
-NEActivationLayerKernelEx::activation(const Window &window)
-{
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationFunction act = F;
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(_input, win_collapsed);
- Iterator output(_output, win_collapsed);
-
- const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform();
- const auto vconst_1 = vdupq_n_f32(1.f);
- const float32x4_t va_f32 = vdupq_n_f32(_act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b());
- const float a_f32 = _act_info.a();
- const float b_f32 = _act_info.b();
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
- ARM_COMPUTE_UNUSED(tmp);
-
- // Compute S elements per iteration
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if (act == ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep = {{
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
- vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
- vin_deq.val[1])))),
- }};
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else if (act == ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep = {{
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- }};
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- T in = *(reinterpret_cast<const T *>(input_ptr + x));
- T tmp;
- if (act == ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else if (act == ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-
-Status NEActivationLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(
- validate_and_configure_window(input->clone().get(),
- (output != nullptr) ? output->clone().get() : nullptr)
- .first);
-
- return Status{};
-}
-
-void NEActivationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- // Early exit on disabled activation
- if (!_act_info.enabled())
- {
- return;
- }
-
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (this->*_func)(window);
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
deleted file mode 100644
index 32d7d6237..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <map>
-#include <string>
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace arm_compute
-{
-
-template <BinaryLogicalOperation op, typename ScalarType>
-inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch (op)
- {
- case BinaryLogicalOperation::AND:
- res = a & b;
- break;
- case BinaryLogicalOperation::OR:
- res = a | b;
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <BinaryLogicalOperation op, typename VectorType>
-inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b)
-{
- VectorType res = {0, 0, 0, 0};
-
- switch (op)
- {
- case BinaryLogicalOperation::AND:
- res = wrapper::vand(a, b);
- break;
- case BinaryLogicalOperation::OR:
- res = wrapper::vorr(a, b);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <BinaryLogicalOperation op>
-inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b)
-{
- uint8x16x4_t out = {{
- elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]),
- elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]),
- }};
- return out;
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-inline VectorType elementwise_logic_op_broadcast(const VectorType &a,
- const ScalarType &broadcast_value,
- const bool reorder)
-{
- VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr, const ScalarType *input2_ptr,
- ScalarType *output_ptr)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b));
- }
- return x;
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x,
- int window_step_x,
- const ScalarType *non_broadcast_input_ptr,
- const ScalarType &broadcast_value,
- ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x,
- elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <BinaryLogicalOperation op, typename ScalarType, typename VectorType>
-void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out,
- const Window &window)
-{
- elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>,
- &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>,
- &elementwise_logic_op_loop<op, ScalarType, VectorType>);
-}
-
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func(
- const ITensor *input1, const ITensor *input2, ITensor *output,
- std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
-{
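- // Build the dispatch key as "op_<input1 type>_<input2 type>_<output type>" and look it up in
- // the function map; returns nullptr when no kernel is registered for that combination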
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
- function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
- function_to_call += string_from_data_type(output->info()->data_type());
-
- auto it = map_function.find(function_to_call);
-
- if (it != map_function.end())
- {
- auto func = it->second;
- return [func](const ITensor *input1, const ITensor *input2, ITensor *output,
- const Window &window) { func(input1, input2, output, window); };
- }
- return nullptr;
-}
-
-template <BinaryLogicalOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output)
-{
- static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = {
- {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
- {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
-
- return configure_func(input1, input2, output, map_function);
-}
-
-void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1,
- const ITensor *input2, ITensor *output)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
- configure_common(input1, input2, output);
- switch (op)
- {
- case BinaryLogicalOperation::AND:
- _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output);
- break;
- case BinaryLogicalOperation::OR:
- _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1,
- const ITensorInfo &input2,
- const ITensorInfo &output)
-{
- // Validate in case of configured output
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8,
- DataType::QASYMM8);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
-
- const TensorShape out_shape =
- TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Validate in case of configured output
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op,
- const ITensorInfo *input1,
- const ITensorInfo *input2,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
- return Status{};
-}
-
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
new file mode 100644
index 000000000..87e716b4f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
+
+#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "src/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "support/SaturateCast.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/NEON/INEKernel.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+ ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
+ DataType::S16, DataType::U16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32);
+
+ // Validate in case of configured output
+ if (output->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
+NECastBoolKernel::NECastBoolKernel() : _input(nullptr), _output(nullptr) {}
+
+void NECastBoolKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype
+ // must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ _input = input;
+ _output = output;
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ ICPPKernel::configure(win);
+}
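+
+// Usage sketch (illustrative only; `src` is assumed to be a U8 boolean tensor and `dst` an
+// allocated tensor of the desired destination type):
+//   NECastBoolKernel cast_kernel;
+//   cast_kernel.configure(&src, &dst);
+//   NEScheduler::get().schedule(&cast_kernel, Window::DimY);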
+
+Status NECastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ return Status{};
+}
+
+void NECastBoolKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
+ ARM_COMPUTE_ERROR_ON(_input == _output);
+
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const int window_step_x = 16;
+
+ Window win{window};
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const uint8_t true_val = 1;
+ const uint8x8_t mask_bool = vdup_n_u8(true_val);
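+ // true_val masks each input byte to its least significant bit, so boolean true (stored as 1)
+ // maps to 1 and false to 0 before widening to the destination type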
+
+ switch (_output->info()->data_type())
+ {
+ case DataType::S8:
+ {
+ /* Conversion U8 -> S8 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ vst1q_s8(output_ptr + x,
+ vreinterpretq_s8_u8(vandq_u8(texels_u8, vdupq_n_u8(true_val))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::S16:
+ {
+ /* Up-conversion U8 -> S16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+
+ vst1q_s16(output_ptr + x, texels.val[0]);
+ vst1q_s16(output_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion U8 -> S32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+
+ vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::F32:
+ {
+ /* Up-conversion U8 -> F32 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+ vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val);
+ *(output_ptr + x) = static_cast<float>(in);
+ }
+ },
+ input, output);
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ /* Up-conversion U8 -> F16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
+ vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
+ vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::U8:
+ {
+ /* Conversion U8 -> U8 (mask to 0/1) */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val)));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ case DataType::U16:
+ {
+ /* Up-conversion U8 -> U16 */
+ execute_window_loop(
+ win,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
+
+ const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)),
+ vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}};
+
+ vst1q_u16(output_ptr + x, texels.val[0]);
+ vst1q_u16(output_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val);
+ }
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
index 091d38c56..3ad9ee945 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
@@ -47,10 +47,13 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
using namespace arm_compute;
NEEmbeddingLookupKernel::NEEmbeddingLookupKernel()
- : _input(nullptr), _lookups(nullptr), _output(nullptr)
+ : _input(nullptr), _lookups(nullptr), _output(nullptr)
{
}
@@ -79,8 +82,8 @@ Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input,
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4);
@@ -119,16 +122,17 @@ void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info)
{
Iterator output_it(_output, out_slice);
- execute_window_loop(out_slice,
- [&](const Coordinates &id) {
- const int32_t lookup = *reinterpret_cast<int32_t *>(
- _lookups->ptr_to_element(Coordinates{id[lookup_dim]}));
- Coordinates input_id{id};
- input_id.set(lookup_dim, lookup);
- memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
- _output->info()->dimension(0) * _output->info()->element_size());
- },
- output_it);
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &id) {
+ const int32_t lookup =
+ *reinterpret_cast<int32_t *>(_lookups->ptr_to_element(Coordinates{id[lookup_dim]}));
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ },
+ output_it);
} while (window.slide_window_slice_4D(out_slice));
}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 000000000..375fa28e5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+using namespace arm_compute;
+
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0));
+
+ return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum,
+ ITensorInfo *biases)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ bool window_changed = update_window_and_padding(
+ win, AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration),
+ AccessWindowStatic(biases, 0, 0,
+ ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration),
+ biases->tensor_shape().y()));
+
+ AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration);
+
+ // Set the valid region for the accum tensor
+ Coordinates coord;
+ coord.set_num_dimensions(accum->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
+
+ _biases = biases;
+ _accum = accum;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(accum->info(), biases->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum,
+ const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(accum->clone().get(), biases->clone().get()).first);
+
+ return Status{};
+}
+
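+// File-scope mutex taken for the whole body of run() below, so execution of this kernel is serialized across the scheduler's worker threads.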
+std::mutex m;
+void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info)
+{
+ std::lock_guard<std::mutex> lock_guard(m);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_biases;
+ win_biases.set(Window::DimX,
+ Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in0_out(_accum, window);
+ Iterator in1(_biases, win_biases);
+
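+ // Each iteration adds one 16-element chunk of the bias row to the accumulator in place (F32: four float32x4 lanes; F16: two float16x8 lanes when FP16 vector arithmetic is available).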
+ switch (_accum->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
+ const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
+ const float32x4x4_t res = {
+ {vaddq_f32(accum.val[0], biases.val[0]), vaddq_f32(accum.val[1], biases.val[1]),
+ vaddq_f32(accum.val[2], biases.val[2]), vaddq_f32(accum.val[3], biases.val[3])}};
+
+ vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
+ const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
+ const float16x8x2_t res = {
+ {vaddq_f16(accum.val[0], biases.val[0]), vaddq_f16(accum.val[1], biases.val[1])}};
+
+ vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
index 4c0a5e799..d4144e6b9 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -40,7 +40,7 @@
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -50,6 +50,9 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
namespace arm_compute
{
namespace
@@ -70,7 +73,10 @@ template <typename U> void validate_indices(const ITensor *indices)
} // namespace
-NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {}
+NEGatherKernelEx::NEGatherKernelEx()
+ : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{}
+{
+}
template <typename U>
inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info)
@@ -82,36 +88,35 @@ inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadIn
Iterator output_it(_output, window);
execute_window_loop(
- window,
- [&](const Coordinates &id) {
- Coordinates gather_id(id);
- gather_id.collapse(_indices->info()->num_dimensions(), 0);
-
- U new_index;
- switch (_indices->info()->num_dimensions())
- {
- case 1:
- new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
- break;
- case 2:
- new_index =
- *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1]))));
- break;
- case 3:
- new_index = *(
- reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2]))));
- break;
- default:
- ARM_COMPUTE_ERROR("Wrong num of dimensions");
- break;
- }
-
- gather_id.set(0, new_index);
-
- std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(),
- output_it.ptr());
- },
- output_it);
+ window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices_rank);
+
+ U new_index;
+ switch (_indices_rank)
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
+ break;
+ case 2:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1]))));
+ break;
+ case 3:
+ new_index =
+ *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(0, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(),
+ output_it.ptr());
+ },
+ output_it);
}
template <typename U>
@@ -127,37 +132,36 @@ void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &inf
Iterator output_it(_output, output_window);
execute_window_loop(
- output_window,
- [&](const Coordinates &id) {
- Coordinates gather_id(id);
- gather_id.collapse(_indices->info()->num_dimensions(), _axis);
-
- U new_index;
- switch (_indices->info()->num_dimensions())
- {
- case 1:
- new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
- break;
- case 2:
- new_index = *(reinterpret_cast<U *>(
- _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1]))));
- break;
- case 3:
- new_index = *(reinterpret_cast<U *>(
- _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2]))));
- break;
- default:
- ARM_COMPUTE_ERROR("Wrong num of dimensions");
- break;
- }
-
- gather_id.set(_axis, new_index);
-
- std::copy_n(_input->ptr_to_element(gather_id),
- _input->info()->dimension(0) * _output->info()->element_size(),
- output_it.ptr());
- },
- output_it);
+ output_window,
+ [&](const Coordinates &id) {
+ Coordinates gather_id(id);
+ gather_id.collapse(_indices_rank, _axis);
+
+ U new_index;
+ switch (_indices_rank)
+ {
+ case 1:
+ new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
+ break;
+ case 2:
+ new_index = *(
+ reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1]))));
+ break;
+ case 3:
+ new_index = *(reinterpret_cast<U *>(
+ _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2]))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Wrong num of dimensions");
+ break;
+ }
+
+ gather_id.set(_axis, new_index);
+
+ std::copy_n(_input->ptr_to_element(gather_id),
+ _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr());
+ },
+ output_it);
}
void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output,
@@ -167,13 +171,14 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I
ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
_input = input;
_indices = indices;
_output = output;
_axis = axis;
+ _indices_rank = indices->info()->num_dimensions();
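+ // Cache the indices rank at configure time; the gather loops read it when collapsing coordinates at run time.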
if (_axis < 0)
{
@@ -213,7 +218,7 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I
}
// Output auto initialization if not yet initialized
TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
- input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
// Create window
@@ -239,15 +244,15 @@ Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *i
ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
- input->tensor_shape(), indices->tensor_shape(), axis);
+ input->tensor_shape(), indices->tensor_shape(), axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
index 30787c0a4..f178865b7 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -47,6 +47,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <unordered_map>
using namespace arm_compute;
@@ -57,7 +60,7 @@ constexpr size_t NOT_HIT = 0xFFFFFFFF;
} // namespace
NEHashtableLookupKernel::NEHashtableLookupKernel()
- : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
+ : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr}
{
}
@@ -66,7 +69,7 @@ void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *k
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
ARM_COMPUTE_ERROR_THROW_ON(
- validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+ validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
_lookups = lookups;
_keys = keys;
@@ -92,8 +95,8 @@ Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
@@ -134,8 +137,8 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
const size_t lookup_dim = _output->info()->num_dimensions() - 1;
const int const_0 = _output->info()->data_type() == DataType::QASYMM8
- ? _output->info()->quantization_info().uniform().offset
- : 0;
+ ? _output->info()->quantization_info().uniform().offset
+ : 0;
std::unordered_map<int32_t, size_t> key_index_map;
for (size_t n = 0; n < _keys->info()->dimension(0); ++n)
@@ -174,24 +177,24 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
{
Iterator output_it(_output, out_slice);
- execute_window_loop(out_slice,
- [&](const Coordinates &id) {
- const auto lookup = lookup_indices.at(id[lookup_dim]);
- if (lookup == NOT_HIT)
- {
- memset(output_it.ptr(), const_0,
- _output->info()->dimension(0) * _output->info()->element_size());
- }
- else
- {
- Coordinates input_id{id};
- input_id.set(lookup_dim, lookup);
- memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
- _output->info()->dimension(0) * _output->info()->element_size());
- }
-
- },
- output_it);
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &id) {
+ const auto lookup = lookup_indices.at(id[lookup_dim]);
+ if (lookup == NOT_HIT)
+ {
+ memset(output_it.ptr(), const_0,
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+ else
+ {
+ Coordinates input_id{id};
+ input_id.set(lookup_dim, lookup);
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_id),
+ _output->info()->dimension(0) * _output->info()->element_size());
+ }
+ },
+ output_it);
} while (window.slide_window_slice_4D(out_slice));
}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
index 49adf1462..7804f9c6a 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -40,17 +40,22 @@
#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -63,7 +68,7 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma
{
/** NEON vector tag type. */
using ExactTagType =
- typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
// Clear X/Y dimensions on execution window as we handle the planes manually
Window win = window;
@@ -73,107 +78,107 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma
constexpr int window_step_x = 16 / sizeof(T);
const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
const auto channel_idx =
- get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
Iterator input_it(input, win);
execute_window_loop(
- win,
- [&](const Coordinates &id) {
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<T>(0.f);
- auto sum_squares_h_w = static_cast<T>(0.f);
-
- execute_window_loop(
- win_plane,
- [&](const Coordinates &) {
- const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for (; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val);
- vec_sum_squares_h_w =
- wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
- }
-
- auto vec2_sum_h_w =
- wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w),
- wrapper::vgetlow(vec_sum_squares_h_w));
- for (int i = 0; i < window_step_x / 4; ++i)
- {
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
- }
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for (; x < window.x().end(); ++x)
- {
- const auto value = *(input_ptr + x);
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- auto gamma_val = 1.0f;
- if (gamma != nullptr)
- {
- gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]}));
- }
- const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
- auto beta_val = 0.0f;
- if (beta != nullptr)
- {
- beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]}));
- }
- const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{});
-
- execute_window_loop(
- win_plane,
- [&](const Coordinates &) {
- auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
- for (; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- vec_val = wrapper::vloadq(input_ptr + x);
- vec_val = wrapper::vadd(
- wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
- wrapper::vstore(output_ptr + x, vec_val);
- }
-
- // Compute left-over elements
- for (; x < window.x().end(); ++x)
- {
- *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val;
- }
- },
- input_plane_it, output_plane_it);
- },
- input_it);
+ win,
+ [&](const Coordinates &id) {
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<T>(0.f);
+ auto sum_squares_h_w = static_cast<T>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val);
+ vec_sum_squares_h_w =
+ wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
+ }
+
+ auto vec2_sum_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w),
+ wrapper::vgetlow(vec_sum_squares_h_w));
+ for (int i = 0; i < window_step_x / 4; ++i)
+ {
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+ }
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = *(input_ptr + x);
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ auto gamma_val = 1.0f;
+ if (gamma != nullptr)
+ {
+ gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]}));
+ }
+ const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
+ auto beta_val = 0.0f;
+ if (beta != nullptr)
+ {
+ beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]}));
+ }
+ const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &) {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ vec_val = wrapper::vloadq(input_ptr + x);
+ vec_val = wrapper::vadd(
+ wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
+ wrapper::vstore(output_ptr + x, vec_val);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val;
+ }
+ },
+ input_plane_it, output_plane_it);
+ },
+ input_it);
}
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
@@ -199,8 +204,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
- input->data_layout(), DataLayoutDimension::CHANNEL)) !=
- gamma->dimension(0),
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ gamma->dimension(0),
"Gamma's size must be the same as size of input's channel");
}
@@ -208,8 +213,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index(
- input->data_layout(), DataLayoutDimension::CHANNEL)) !=
- beta->dimension(0),
+ input->data_layout(), DataLayoutDimension::CHANNEL)) !=
+ beta->dimension(0),
"Beta's size must be the same as size of input's channel");
}
@@ -234,8 +239,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
} // namespace
NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx()
- : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr),
- _epsilon(1e-12)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr),
+ _epsilon(1e-12)
{
}
@@ -251,7 +256,7 @@ void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *ou
_epsilon = epsilon;
ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon));
+ validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon));
if (_input->info()->data_type() == DataType::F32)
{
@@ -282,7 +287,7 @@ Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
- input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
index b92130cec..8ad998313 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -42,13 +42,15 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
@@ -123,15 +125,17 @@ inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale)
const float32x4_t vscale = vdupq_n_f32(scale);
const float32x4x4_t ret = {{
- vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale),
- vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale),
+ vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale),
+ vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale),
+ vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale),
+ vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale),
}};
return ret;
}
} // namespace
NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel()
- : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+ : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
{
}
@@ -140,7 +144,7 @@ void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input->info(), scale_factor->info(), output->info()));
+ validate_arguments(input->info(), scale_factor->info(), output->info()));
_input = input;
_scale_factor = scale_factor;
@@ -180,25 +184,25 @@ template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &w
Iterator output(_output, win_collapsed);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
execute_window_loop(
- win_collapsed,
- [&](const Coordinates &id) {
- auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
- scale *= _multiplier;
-
- const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
- auto output_ptr = reinterpret_cast<T *>(output.ptr());
- int x = window_start_x;
- for (; x <= (window_end_x - window_step); x += window_step)
- {
- store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
- }
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- output_ptr[x] = input_ptr[x] * scale;
- }
- },
- input, output);
+ win_collapsed,
+ [&](const Coordinates &id) {
+ auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
+ scale *= _multiplier;
+
+ const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = input_ptr[x] * scale;
+ }
+ },
+ input, output);
}
void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
new file mode 100644
index 000000000..04eb407e9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
+#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Validate the depth
+ *
+ * Validate that the depth is not negative
+ *
+ * @param[in] depth Depth tensor.
+ * @param[in] output Output tensor.
+ * @param[in] axis Axis of depth.
+ */
+template <typename U> void validate_depth(const ITensor *depth, const ITensor *output, int axis)
+{
+ ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(depth->buffer())) < 0);
+ ARM_COMPUTE_ERROR_ON(static_cast<U>(output->info()->tensor_shape()[axis]) !=
+ *(reinterpret_cast<U *>(depth->buffer())));
+}
+
+Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
+ const int actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(0 > actual_axis ||
+ actual_axis >= static_cast<int>(output->num_dimensions()));
+ ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::F16,
+ DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+ if (output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+ }
+
+ return Status{};
+}
+
+template <typename U, typename Enable = void> bool isOnValue(U) { return true; }
+
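+// An index selects the on value only when it lies in [0, depth).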
+template <typename U, std::enable_if_t<std::is_integral<U>::value, int> = 0>
+bool isOnValue(U index, U depth)
+{
+ return index >= 0 && index < depth;
+}
+} // namespace
+
+NEOneHotKernel::NEOneHotKernel()
+ : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1},
+ _output{nullptr}, _func{}
+{
+}
+
+template <typename U>
+void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ // Validate that the depth is not negative
+ validate_depth<U>(_depth, _output, _axis);
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ Iterator output_it(_output, output_window);
+ const U off_value = *reinterpret_cast<U *>(_off_value->buffer());
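+ // For each output row along axis 0: fill the row with the off value, then copy the on value into the looked-up index position when that index lies in [0, depth).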
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id) {
+ std::fill_n(output_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size(),
+ off_value);
+ Coordinates indices_id(id);
+ indices_id.remove(0);
+ const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
+ if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
+ {
+ Coordinates onehot_id(id);
+ onehot_id.set(0, new_index);
+ std::copy_n(_on_value->buffer(), _output->info()->element_size(),
+ _output->ptr_to_element(onehot_id));
+ }
+ },
+ output_it);
+}
+
+template <typename U>
+inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ // Validate that the depth is not negative
+ validate_depth<U>(_depth, _output, _axis);
+ Iterator output_it(_output, window);
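+ // Visit each output element; when the looked-up index lies in [0, depth), write the on value where the coordinate along the one-hot axis equals that index and the off value otherwise.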
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id) {
+ Coordinates indices_id(id);
+ indices_id.remove(_axis);
+ const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id)));
+ if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer()))))
+ {
+ Coordinates onehot_id(id);
+ onehot_id.set(_axis, new_index);
+ std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer()
+ : _off_value->buffer(),
+ _output->info()->element_size(), output_it.ptr());
+ }
+ },
+ output_it);
+}
+
+void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth,
+ const ITensor *on_value, const ITensor *off_value, ITensor *output,
+ int axis)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output);
+ ARM_COMPUTE_ERROR_ON(output->info()->total_size() == 0);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(indices->info(), depth->info(), on_value->info(),
+ off_value->info(), output->info(), axis));
+ _indices = indices;
+ _depth = depth;
+ _on_value = on_value;
+ _off_value = off_value;
+ _output = output;
+ _axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
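+ // Axis 0 is handled by the row-oriented onehot_0_axis path; any other axis by the element-wise onehot_n_axis path, each specialised on the indices data type.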
+ if (0 == _axis)
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEOneHotKernel::onehot_0_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEOneHotKernel::onehot_0_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch (_indices->info()->data_type())
+ {
+ case DataType::U32:
+ _func = &NEOneHotKernel::onehot_n_axis<uint32_t>;
+ break;
+ case DataType::S32:
+ _func = &NEOneHotKernel::onehot_n_axis<int32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ }
+ // Create window
+ Window win = calculate_max_window(*output->info(), Steps());
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ INEKernel::configure(win);
+}
+
+Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(indices, depth, on_value, off_value, output, axis));
+ return Status{};
+}
+
+void NEOneHotKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ (this->*_func)(window, info);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
index 5841f1d69..420e5063c 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -42,13 +42,16 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
@@ -107,19 +110,15 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv,
const int32x4x4_t rf = {{
#ifdef __aarch64__
- vminq_s32(vposend,
- vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
- vminq_s32(vposend,
- vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
- vminq_s32(vposend,
- vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
- vminq_s32(vposend,
- vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
#else //__aarch64__
- vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
- vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
- vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
- vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+ vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
#endif //__aarch64__
}};
const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
@@ -129,7 +128,7 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv,
} // namespace
NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel()
- : _input(nullptr), _output(nullptr), _scale_factor(nullptr)
+ : _input(nullptr), _output(nullptr), _scale_factor(nullptr)
{
}
@@ -138,7 +137,7 @@ void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *out
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input->info(), output->info(), scale_factor->info()));
+ validate_arguments(input->info(), output->info(), scale_factor->info()));
_input = input;
_output = output;
@@ -182,40 +181,40 @@ template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window
const auto dim_x = _input->info()->dimension(0);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
execute_window_loop(
- win_collapsed,
- [&](const Coordinates &id) {
- const auto start = reinterpret_cast<const T *>(input.ptr());
- const auto min_max = std::minmax_element(start, start + dim_x);
- const auto int8_scale = 127;
- auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second));
- if (range == 0)
- {
- *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1;
- range = 1;
- }
- else
- {
- *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale;
- }
- const auto scale_factor_inv = int8_scale / range;
-
- auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- int x = window_start_x;
- for (; x <= (window_end_x - window_step); x += window_step)
- {
- wrapper::vstore(&output_ptr[x],
- vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale));
- }
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy);
- quantized = std::min(int8_scale, std::max(quantized, -int8_scale));
- output_ptr[x] = static_cast<int8_t>(quantized);
- }
- },
- input, output);
+ win_collapsed,
+ [&](const Coordinates &id) {
+ const auto start = reinterpret_cast<const T *>(input.ptr());
+ const auto min_max = std::minmax_element(start, start + dim_x);
+ const auto int8_scale = 127;
+ auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second));
+ if (range == 0)
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1;
+ range = 1;
+ }
+ else
+ {
+ *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale;
+ }
+ const auto scale_factor_inv = int8_scale / range;
+
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x],
+ vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy);
+ quantized = std::min(int8_scale, std::max(quantized, -int8_scale));
+ output_ptr[x] = static_cast<int8_t>(quantized);
+ }
+ },
+ input, output);
}
void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
deleted file mode 100644
index 3b65eac10..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
+++ /dev/null
@@ -1,693 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-// Helper function to calculate the minimum value of the input vector. All the elements in the
-// output vector contain the min value.
-float32x2_t calculate_min(float32x4_t in)
-{
- auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
- return wrapper::vpmin(pmin, pmin);
-}
-
-// Helper function to calculate the maximum value of the input vector. All the elements in the
-// output vector contain the max value.
-float32x2_t calculate_max(float32x4_t in)
-{
- auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
- return wrapper::vpmax(pmax, pmax);
-}
-// Helper function to calculate the minimum value of the input vector. All the elements in the
-// output vector contain the min value.
-int32x2_t calculate_min(int32x4_t in)
-{
- auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
- return wrapper::vpmin(pmin, pmin);
-}
-
-// Helper function to calculate the maximum value of the input vector. All the elements in the
-// output vector contain the max value.
-int32x2_t calculate_max(int32x4_t in)
-{
- auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
- return wrapper::vpmax(pmax, pmax);
-}
-
-// Helper function to calculate the minimum value of the input vector. All the elements in the
-// output vector contain the min value.
-inline uint8x8_t calculate_min(uint8x16_t in)
-{
- auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmin = wrapper::vpmin(pmin, pmin);
- pmin = wrapper::vpmin(pmin, pmin);
- return wrapper::vpmin(pmin, pmin);
-}
-// Helper function to calculate the maximum value of the input vector. All the elements in the
-// output vector contain the max value.
-inline uint8x8_t calculate_max(uint8x16_t in)
-{
- auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmax = wrapper::vpmax(pmax, pmax);
- pmax = wrapper::vpmax(pmax, pmax);
- return wrapper::vpmax(pmax, pmax);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-// Helper function to calculate the minimum value of the input vector. All the elements in the
-// output vector contain the min value.
-inline float16x4_t calculate_min(float16x8_t in)
-{
- auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmin = wrapper::vpmin(pmin, pmin);
- return wrapper::vpmin(pmin, pmin);
-}
-// Helper function to calculate the maximum value of the input vector. All the elements in the
-// output vector contain the max value.
-inline float16x4_t calculate_max(float16x8_t in)
-{
- auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
- pmax = wrapper::vpmax(pmax, pmax);
- return wrapper::vpmax(pmax, pmax);
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <class F> class Reducer
-{
-public:
- static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f,
- const ReduceOperation op)
- {
- // Set out window
- Window out_window(window);
- out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- // Get first input and output slices
- Window in_slice = window.first_slice_window_1D();
- Window out_slice = out_window.first_slice_window_1D();
-
- do
- {
- Iterator in(input, in_slice);
- Iterator out(output, out_slice);
-
- f(in, out, in_slice, out_slice, *input->info(), op);
- } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
- }
- static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f,
- const ReduceOperation op)
- {
- // Set in window
- Window in_window(window);
- Window out_window(window);
-
- in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
- out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1),
- output->info()->dimension(1)));
-
- // Get first input and output slices
- Window in_slice = in_window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- do
- {
- Iterator in(input, in_slice);
- Iterator out(output, out_slice);
-
- f(in, out, in_slice, out_slice, *input->info(), 1, op);
- } while (in_window.slide_window_slice_2D(in_slice) &&
- out_window.slide_window_slice_2D(out_slice));
- }
- static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f,
- const ReduceOperation op)
- {
- // Set in window
- Window in_window(window);
- Window out_window(window);
-
- in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
- out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2),
- output->info()->dimension(2)));
-
- // Get first input and output slices
- Window in_slice = in_window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_3D();
-
- do
- {
- Iterator in(input, in_slice);
- Iterator out(output, out_slice);
-
- f(in, out, in_slice, out_slice, *input->info(), 2, op);
- } while (in_window.slide_window_slice_3D(in_slice) &&
- out_window.slide_window_slice_3D(out_slice));
- }
- static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f,
- const ReduceOperation op)
- {
- // Set in/out window
- Window in_window(window);
- Window out_window(window);
-
- in_window.set(3, Window::Dimension(0, 1, 1));
- out_window.set(3, Window::Dimension(0, 1, 1));
-
- // Get first input and output slices
- Window in_slice = in_window.first_slice_window_4D();
- Window out_slice = out_window.first_slice_window_4D();
-
- do
- {
- Iterator in(input, in_slice);
- Iterator out(output, out_slice);
-
- f(in, out, in_slice, out_slice, *input->info(), 3, op);
- } while (in_window.slide_window_slice_4D(in_slice) &&
- out_window.slide_window_slice_4D(out_slice));
- }
-};
-
-template <typename T, int S> struct RedOpX
-{
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
- const TensorInfo &in_info, const ReduceOperation op)
- {
- ARM_COMPUTE_UNUSED(out_slice);
- ARM_COMPUTE_UNUSED(in_info);
- auto init_res_value = static_cast<T>(0.f);
- switch (op)
- {
- case ReduceOperation::MIN:
- case ReduceOperation::MAX:
- {
- init_res_value = *reinterpret_cast<T *>(input.ptr());
- break;
- }
- default:
- break;
- }
- auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
-
- execute_window_loop(in_slice,
- [&](const Coordinates &) {
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto vec_elements = wrapper::vloadq(in_ptr);
-
- switch (op)
- {
- case ReduceOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReduceOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input);
-
- switch (op)
- {
- case ReduceOperation::MIN:
- {
- *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0);
- break;
- }
- case ReduceOperation::MAX:
- {
- *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-};
-
-struct RedOpX_qasymm8
-{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
- const TensorInfo &in_info, const ReduceOperation op)
- {
- ARM_COMPUTE_UNUSED(out_slice);
- ARM_COMPUTE_UNUSED(in_info);
-
- uint8x16_t vec_res_value = {0};
-
- if (op == ReduceOperation::MIN || op == ReduceOperation::MAX)
- {
- vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{});
- }
-
- execute_window_loop(in_slice,
- [&](const Coordinates &) {
- const auto vec_elements = wrapper::vloadq(input.ptr());
- switch (op)
- {
- case ReduceOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReduceOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input);
-
- switch (op)
- {
- case ReduceOperation::MIN:
- {
- *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
- break;
- }
- case ReduceOperation::MAX:
- {
- *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
- }
-};
-
-template <typename T, int S> struct RedOpYZW
-{
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
-
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
- const TensorInfo &in_info, int axis, const ReduceOperation op)
- {
- ARM_COMPUTE_UNUSED(out_slice);
-
- execute_window_loop(
- in_slice,
- [&](const Coordinates &) {
- neon_vector vec_res_value = {0};
- switch (op)
- {
- case ReduceOperation::MIN:
- case ReduceOperation::MAX:
- {
- vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr()));
- break;
- }
- default:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- break;
- }
- }
-
- for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
- {
- T *in_ptr;
- switch (axis)
- {
- case 1:
- in_ptr = reinterpret_cast<T *>(
- input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
- break;
- case 2:
- in_ptr = reinterpret_cast<T *>(
- input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
- break;
- case 3:
- in_ptr = reinterpret_cast<T *>(
- input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- const auto vec_elements = wrapper::vloadq(in_ptr);
-
- switch (op)
- {
- case ReduceOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReduceOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
- wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value);
- },
- input, output);
- }
-};
-
-struct RedOpYZW_qasymm8
-{
- inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice,
- const TensorInfo &in_info, int axis, const ReduceOperation op)
- {
- ARM_COMPUTE_UNUSED(out_slice);
-
- execute_window_loop(
- in_slice,
- [&](const Coordinates &) {
- auto vec_res_value = wrapper::vloadq(input.ptr());
-
- for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
- {
- uint8_t *in_ptr;
- switch (axis)
- {
- case 1:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
- break;
- case 2:
- in_ptr =
- input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
- break;
- case 3:
- in_ptr =
- input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- const auto vec_elements = wrapper::vloadq(in_ptr);
-
- switch (op)
- {
- case ReduceOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReduceOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
- wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value);
- },
- input, output);
- }
-};
-
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis,
- const ReduceOperation op)
-{
- const bool is_complex = (input->info()->num_channels() == 2);
- if (is_complex)
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
-
- switch (axis)
- {
- case 0:
- switch (input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output,
- RedOpX<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
- case DataType::S32:
- return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(),
- op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 1:
- switch (input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output,
- RedOpYZW<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(),
- op);
- case DataType::S32:
- return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output,
- RedOpYZW<int32_t, 4>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 2:
- switch (input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output,
- RedOpYZW<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(),
- op);
- case DataType::S32:
- return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output,
- RedOpYZW<int32_t, 4>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- case 3:
- switch (input->info()->data_type())
- {
- case DataType::QASYMM8:
- return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output,
- RedOpYZW<float16_t, 8>(), op);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(),
- op);
- case DataType::S32:
- return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output,
- RedOpYZW<int32_t, 4>(), op);
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
- }
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
- ReduceOperation op)
-{
- ARM_COMPUTE_UNUSED(op);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-
- if (input->num_channels() == 1)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
- DataType::F16, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Not support complex");
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
- "Reduction axis greater than max number of dimensions");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-
- if (output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
-
- const TensorShape output_shape =
- arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
- const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- unsigned int axis, ReduceOperation op)
-{
- ARM_COMPUTE_UNUSED(op);
-
- // Calculate output shape and set if empty
- const TensorShape output_shape =
- arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
-
- // Output auto initialization if not yet initialized
- DataType output_data_type = input->data_type();
- auto_init_if_empty(*output, input->clone()
- ->set_tensor_shape(output_shape)
- .set_data_type(output_data_type)
- .reset_padding()
- .set_is_resizable(true));
-
- unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
-
- return std::make_tuple(err, win);
-}
-} // namespace
-
-NEReductionOperationKernelEx::NEReductionOperationKernelEx()
- : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX),
- _border_size()
-{
-}
-
-BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; }
-
-void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output,
- unsigned int axis, ReduceOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
- unsigned int num_elems_processed_per_iteration =
- 16 / data_size_from_type(input->info()->data_type());
-
- _input = input;
- _output = output;
- _border_size =
- (axis == 0)
- ? BorderSize(0, num_elems_processed_per_iteration -
- (input->info()->dimension(0) % num_elems_processed_per_iteration),
- 0, 0)
- : BorderSize();
- _op = op;
- _reduction_axis = axis;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op);
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- INEKernel::configure(std::get<1>(win_config));
-}
-
-Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- unsigned int axis, ReduceOperation op)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(
- validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op)));
-
- return Status{};
-}
-
-void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- reduce_op(window, _input, _output, _reduction_axis, _op);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
new file mode 100644
index 000000000..6b9b0d4b4
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/Utils.h"
+
+namespace arm_compute
+{
+CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(),
+ _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis()
+{
+}
+
+Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+ const ReductionOperation &op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
+ op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid reduction operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+ "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+ const unsigned int num_of_stages =
+ utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+
+ DataType output_data_type = DataType::S32;
+ TensorInfo not_reshaped_output;
+ const auto input_num_channels = input->num_channels();
+ const auto input_qinfo = input->quantization_info();
+
+ if (output->total_size() != 0)
+ {
+ output_data_type = output->data_type();
+ const TensorInfo expected_output_shape =
+ output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(
+ input->tensor_shape(), axis, false));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+ }
+
+ auto shape_before_reshape = input->tensor_shape();
+ shape_before_reshape.set(axis, 1);
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type,
+ int num_channels, QuantizationInfo qinfo) {
+ ti.set_data_type(data_type)
+ .set_tensor_shape(shape)
+ .set_num_channels(num_channels)
+ .set_quantization_info(qinfo);
+ };
+
+ initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type,
+ input_num_channels, input_qinfo);
+
+ if (num_of_stages == 1)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxLayerKernelEx::validate(input, nullptr, &not_reshaped_output, axis, op));
+ }
+ else
+ {
+ // Create temporary tensor infos
+ std::vector<TensorInfo> sums_vector(num_of_stages - 1);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (unsigned int i = 0; i < num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ sums_vector[i].set_data_type(input->data_type());
+ sums_vector[i].set_tensor_shape(shape);
+ sums_vector[i].set_num_channels(input->num_channels());
+ }
+
+ // Validate ReductionOperation only on first kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op));
+
+ // Validate ReductionOperation on intermediate stages
+ for (unsigned int i = 1; i < num_of_stages - 1; ++i)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op));
+ }
+
+ // Validate ReductionOperation on the last stage
+ const unsigned int last_stage = num_of_stages - 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
+ input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
+ }
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output));
+ return Status{};
+}
+
+void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output,
+ const ReductionOperation &op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+ _reduction_axis = axis;
+
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
+ input->info()->tensor_shape(), axis, false);
+ DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN)
+ ? DataType::S32
+ : output->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ // Configure reduction operation kernels
+ _reduction_kernels_vector.resize(_num_of_stages);
+
+ _memory_group.manage(&_not_reshaped_output);
+ // Create temporary tensors
+ if (_num_of_stages == 1)
+ {
+ // Force an early initialization for int64 output type
+ TensorShape output_shape{input->info()->tensor_shape()};
+ output_shape.set(axis, 1);
+ auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+ _not_reshaped_output.info()->set_tensor_shape(output_shape);
+ _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op);
+ }
+ else
+ {
+ _results_vector.resize(_num_of_stages - 1);
+ TensorShape shape{input->info()->tensor_shape()};
+ for (unsigned int i = 0; i < _num_of_stages - 1; i++)
+ {
+ shape.set(0, ceil(shape.x() / 128.f));
+ _results_vector[i].allocator()->init(
+ input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type));
+ }
+
+ // Apply ReductionOperation only on first kernel
+ _memory_group.manage(&_results_vector[0]);
+ _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op);
+
+ // Apply ReductionOperation on intermediate stages
+ for (unsigned int i = 1; i < _num_of_stages - 1; ++i)
+ {
+ _memory_group.manage(&_results_vector[i]);
+ _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i],
+ axis, op);
+ _results_vector[i - 1].allocator()->allocate();
+ }
+
+ // Apply ReductionOperation on the last stage
+ const unsigned int last_stage = _num_of_stages - 1;
+ _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1],
+ &_not_reshaped_output, axis, op);
+ _results_vector[last_stage - 1].allocator()->allocate();
+ }
+ _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output,
+ output);
+ _not_reshaped_output.allocator()->allocate();
+}
+
+void CLArgMinMaxLayerEx::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _num_of_stages; ++i)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+ }
+ _reshape_kernel.run();
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
index e5122ab8f..31c96b080 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -42,13 +42,14 @@
#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
using namespace arm_compute;
void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
BinaryLogicalOperation op)
{
- auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ auto k = std::make_unique<CLBinaryLogicalOpKernel>();
k->configure(input1, input2, output, op);
_kernel = std::move(k);
@@ -57,7 +58,7 @@ void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTenso
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
if (broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
}
}
}
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
index 768c15b41..96f9c17a9 100644
--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,17 +38,15 @@
* SOFTWARE.
*/
-#include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h"
+#include "arm_compute/runtime/CL/functions/CLCastBool.h"
-#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h"
using namespace arm_compute;
-void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
- const ITensor *off_value, ITensor *output, const int axis)
+void CLCastBool::configure(ICLTensor *input, ICLTensor *output)
{
- auto k = support::cpp14::make_unique<CPPOneHotKernelEx>();
- k->configure(indices, depth, on_value, off_value, output, axis);
+ auto k = std::make_unique<CLCastBoolKernel>();
+ k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
index 3dede0562..464f60dee 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
@@ -45,6 +45,8 @@
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <memory>
#include <tuple>
@@ -53,16 +55,10 @@ namespace arm_compute
using namespace arm_compute::misc::shape_calculator;
CLDirectTransposeConvLayer::CLDirectTransposeConvLayer(
- std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _scale_f(),
- _conv_f(),
- _flip_weights(),
- _scaled_output(),
- _original_weights(nullptr),
- _weights_flipped(),
- _flip_axis(),
- _is_prepared(false)
+ std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), _flip_weights(),
+ _scaled_output(), _original_weights(nullptr), _weights_flipped(), _flip_axis(),
+ _is_prepared(false)
{
}
@@ -74,7 +70,7 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
const DataLayout data_layout = input->data_layout();
@@ -86,8 +82,8 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
auto out_dims = transposeconv_output_dimensions(
- input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
- weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+ input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+ weights->dimension(idx_h), info, invalid_right, invalid_bottom);
const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
@@ -117,19 +113,19 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen
unsigned int pad_right = 0;
unsigned int pad_top = 0;
unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
- pad_bottom);
+ const TensorShape scale_out_shape =
+ compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right,
+ invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
TensorInfo scale_out_info(input->clone()
- ->set_is_resizable(true)
- .reset_padding()
- .set_tensor_shape(scale_out_shape)
- .set_data_layout(data_layout));
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
- conv_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
return Status{};
}
@@ -171,22 +167,22 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte
_flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
auto out_dims = transposeconv_output_dimensions(
- input->info()->dimension(idx_w), input->info()->dimension(idx_h),
- weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
- invalid_bottom);
+ input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+ invalid_bottom);
const TensorShape output_shape =
- compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
auto_init_if_empty(
- *output->info(),
- input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+ *output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
- input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
- info, invalid_right, invalid_bottom));
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info,
+ invalid_right, invalid_bottom));
_is_prepared = weights_info.retain_internal_weights();
@@ -195,8 +191,8 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
// to match output shape
const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
- pad_right, pad_top, pad_bottom);
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
input->info()->quantization_info());
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
index ae9d8afc6..003ec8042 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -39,7 +39,6 @@
*/
#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
-
#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
using namespace arm_compute;
@@ -47,7 +46,7 @@ using namespace arm_compute;
void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
const ICLTensor *lookups)
{
- auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ auto k = std::make_unique<CLEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
index 01989461e..af936e873 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -45,7 +45,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
#include <algorithm>
@@ -60,7 +59,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
ARM_COMPUTE_UNUSED(weights);
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
+ CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
return Status{};
}
@@ -68,7 +67,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = std::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -80,12 +79,12 @@ Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in
}
CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer(
- std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
- _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(),
- _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(),
- _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false),
- _original_weights(nullptr)
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
+ _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(),
+ _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(),
+ _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false),
+ _original_weights(nullptr)
{
}
void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights,
@@ -107,8 +106,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate(
- input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- fc_info));
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
_are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
_accumulate_biases = false;
@@ -140,10 +139,10 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen
bool is_fc_after_conv = false;
if (is_batched_fc_layer)
{
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
- (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ is_fc_after_conv =
+ (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
}
else
{
@@ -158,28 +157,28 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen
{
// Reshape the weights
_reshape_weights_output.allocator()->init(
- weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- compute_transposed_shape(*weights->info())));
+ weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights->info())));
_reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output);
weights_to_use = &_reshape_weights_output;
}
// Extract scale factor
_scale_factor.allocator()->init(
- TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type()));
+ TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type()));
_memory_group.manage(&_scale_factor);
_scale_factor_kernel.configure(input, &_scale_factor);
// Quantize input
_quantized_input.allocator()->init(
- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
- DataType::QASYMM8_SIGNED));
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
_memory_group.manage(&_quantized_input);
_quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
// GEMMLowp
_gemmlowp_output.allocator()->init(
- output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
_memory_group.manage(&_gemmlowp_output);
configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output,
fc_info.retain_internal_weights);
@@ -209,15 +208,15 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
const GPUTarget gpu_target = CLScheduler::get().target();
const ITensorInfo &reshaped_weights =
- TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- compute_transposed_shape(*weights)));
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
// Configure accumulate biases kernel for non quantized asymmetric types
if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
+ CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
}
// With the Fully Connected layer we can have 4 different cases:
@@ -247,33 +246,32 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(
- CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+ CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
// Validate Scale factor kernel
const ITensorInfo &scale_factor =
- TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type());
+ TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
// Validate quantization symm8 kernel
- const ITensorInfo &quantized_input =
- TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
- DataType::QASYMM8_SIGNED));
+ const ITensorInfo &quantized_input = TensorInfo(
+ input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED));
ARM_COMPUTE_RETURN_ON_ERROR(
- CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
+ CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
// Fully Connected layer after a Fully Connected Layer without batches
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
// Validate matrix multiply kernel
const ITensorInfo &gemmlowp_output = TensorInfo(
- output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
// Multiply scale
ARM_COMPUTE_RETURN_ON_ERROR(
- CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output));
+ CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output));
return Status{};
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
index 2ff4b9659..c6a88d340 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -42,11 +42,11 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
+
+#include "support/Cast.h"
#include <algorithm>
@@ -79,7 +79,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn
int output_multiplier = 0;
int output_shift = 0;
ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(
- multiplier, &output_multiplier, &output_shift));
+ multiplier, &output_multiplier, &output_shift));
// Set the GEMMLowp output stage info
gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
@@ -99,7 +99,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
{
GEMMLowpOutputStageInfo gemmlowp_output_stage;
ARM_COMPUTE_RETURN_ON_ERROR(
- construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
+ construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
false, // is_b_reshaped
@@ -125,14 +125,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
// Validate gemmlowp function
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
- &input.clone()->set_quantization_info(input_quantization_info),
- &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output,
- gemm_info));
+ &input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output,
+ gemm_info));
}
else
{
ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
+ CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
}
return Status{};
@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = std::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -154,12 +154,12 @@ Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input,
CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager,
IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(),
- _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(),
- _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager),
- _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(),
- _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
- _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
+ : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(),
+ _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(),
+ _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager),
+ _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(),
+ _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
+ _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
{
}
void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights,
@@ -190,9 +190,9 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens
const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
input->info()->set_quantization_info(QuantizationInfo(
- input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
weights->info()->set_quantization_info(QuantizationInfo(
- weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+ weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
// Configure gemmlowp function
_mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
@@ -214,8 +214,8 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC
const FullyConnectedLayerInfo &fc_info)
{
ARM_COMPUTE_ERROR_ON(
- (weights->info()->dimension(1) !=
- (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+ (weights->info()->dimension(1) !=
+ (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
// If the fully connected layer is called after a convolution layer, the input tensor must be
// linearized
@@ -223,11 +223,11 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC
// Initialize output tensor for flatten
TensorShape shape_flatten = compute_flatten_shape(input->info());
_flatten_output.allocator()->init(input->info()
- ->clone()
- ->set_is_resizable(true)
- .reset_padding()
- .set_tensor_shape(shape_flatten)
- .set_data_layout(DataLayout::NCHW));
+ ->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(shape_flatten)
+ .set_data_layout(DataLayout::NCHW));
// Configure flatten kernel
_memory_group.manage(&_flatten_output);
@@ -258,8 +258,8 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate(
- input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- fc_info));
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
_are_weights_converted = true;
_are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
@@ -285,10 +285,10 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor
const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
if (is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
- (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ _is_fc_after_conv =
+ (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
}
else
{
@@ -302,7 +302,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor
{
_reshape_weights_managed_function.configure(weights);
weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
- _weights_manager->acquire(weights, &_reshape_weights_managed_function));
+ _weights_manager->acquire(weights, &_reshape_weights_managed_function));
}
else
{
@@ -320,7 +320,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor
_convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(),
fc_info.weights_trained_layout);
weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
- _weights_manager->acquire(weights, &_convert_weights_managed));
+ _weights_manager->acquire(weights, &_convert_weights_managed));
}
else
{
@@ -359,16 +359,16 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
bool is_fc_after_conv = true;
const ITensorInfo &flatten_input = TensorInfo(input->clone()
- ->set_is_resizable(true)
- .reset_padding()
- .set_tensor_shape(compute_flatten_shape(input))
- .set_data_layout(DataLayout::NCHW));
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(compute_flatten_shape(input))
+ .set_data_layout(DataLayout::NCHW));
const ITensorInfo &reshaped_weights =
- TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- compute_transposed_shape(*weights)));
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights =
- weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
- : TensorInfo(*reshaped_weights.clone());
+ weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -396,7 +396,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(
- CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights));
+ CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
@@ -404,7 +404,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
{
// Validate convert weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(
- weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
weights_to_use = &converted_weights;
}
@@ -412,8 +412,8 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
{
// Fully Connected layer after a Convolution Layer without batches
ARM_COMPUTE_RETURN_ERROR_ON(
- (weights_to_use->dimension(1) !=
- (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+ (weights_to_use->dimension(1) !=
+ (input->dimension(0) * input->dimension(1) * input->dimension(2))));
// Validate flatten kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
@@ -427,7 +427,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
// Validate matrix multiply kernel
ARM_COMPUTE_RETURN_ON_ERROR(
- validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
+ validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
return Status{};
}
@@ -457,7 +457,7 @@ void CLFullyConnectedLayerEx::run()
if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
{
_original_weights = utils::cast::polymorphic_downcast<ICLTensor *>(
- _weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+ _weights_manager->run(cur_weights, &_reshape_weights_managed_function));
}
else
{
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
index 157b4d977..cda784541 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -19,6 +19,7 @@
#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h>
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
@@ -41,7 +42,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
// reshape
auto_init_if_empty(*_cl_buffer.info(),
_input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
- _input->info()->data_layout()));
+ _input->info()->data_layout()));
_cl_reshape.configure(_input, &_cl_buffer);
input_to_use = &_cl_buffer;
}
@@ -57,7 +58,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
{
bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
input->info()->data_type() == DataType::F16) &&
- (weights->info()->data_type() == DataType::S8 ||
+ (weights->info()->data_type() == DataType::QSYMM8 ||
weights->info()->data_type() == DataType::QASYMM8_SIGNED);
if (is_hybrid)
@@ -81,7 +82,6 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
{
throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
}
-
}();
if (_needs_reshape)
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 000000000..cd7409417
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "support/StringSupport.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ // Select the vector size to use (8 for Bifrost; 16 for Midgard).
+ bool is_gpu_bifrost =
+ gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, GPUTarget::G51,
+ GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT);
+ num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(
+ biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration),
+ biases->dimension(1));
+ AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, biases_access, accum_access);
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), accum, biases);
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *accum, const ICLTensor *biases)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
+
+ _biases = biases;
+ _accum = accum;
+
+ // Get the target gpu
+ GPUTarget gpu_target = get_target();
+ unsigned int vector_size = 0;
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("gemm_accumulate_biases", build_opts.options()));
+}
+
+Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum,
+ const ITensorInfo *biases, GPUTarget gpu_target)
+{
+ unsigned int num_elems_processed_per_iteration = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(),
+ biases->clone().get(), gpu_target,
+ num_elems_processed_per_iteration)
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window accum_slice = window.first_slice_window_2D();
+
+ Window biases_slice(accum_slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _accum, accum_slice);
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+
+ enqueue(queue, *this, accum_slice, lws_hint());
+ } while (window.slide_window_slice_2D(accum_slice));
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
index e0b833b04..f380e3e2c 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -41,6 +41,8 @@
#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
using namespace arm_compute;
@@ -48,7 +50,7 @@ using namespace arm_compute;
void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
int axis)
{
- auto k = support::cpp14::make_unique<CLGatherExKernel>();
+ auto k = std::make_unique<CLGatherExKernel>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
index 65b89a389..9896abd4b 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
{
- auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
+ auto k = std::make_unique<CLHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
index 5a7e40839..ca45a57f8 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
ICLTensor *gamma, ICLTensor *beta, float epsilon)
{
- auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ auto k = std::make_unique<CLInstanceNormalizationLayerKernelEx>();
k->configure(input, output, gamma, beta, epsilon);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
index 28e5bc0da..2bdc451b3 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -46,7 +46,7 @@ using namespace arm_compute;
void CLNeg::configure(ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ auto k = std::make_unique<CLNegKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
new file mode 100644
index 000000000..759a19ff3
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLOneHot.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {}
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value,
+ const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+ _onehot_kernel.configure(indices, on_value, off_value, output, depth, axis);
+}
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+ PixelValue off_value, int depth, int axis)
+{
+ _has_to_memset = true;
+ _memset_kernel.configure(output, off_value);
+ _onehot_kernel.configure(indices, on_value, output, depth, axis);
+}
+Status CLOneHot::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+ const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+ int axis)
+{
+ return CLOneHotKernel::validate(indices, on_value, off_value, output, depth, axis);
+}
+void CLOneHot::run()
+{
+ if (_has_to_memset)
+ {
+ CLScheduler::get().enqueue(_memset_kernel, true);
+ }
+
+ CLScheduler::get().enqueue(_onehot_kernel, false);
+}
+} // namespace arm_compute
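For reference, a minimal usage sketch of the CLOneHot function introduced above, using the scalar off_value overload; the tensor names, depth and axis values are hypothetical, and the tensors are assumed to be initialized and allocated by the caller with a working CL context.

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLOneHot.h"

    using namespace arm_compute;

    void run_onehot_sketch(CLTensor &indices, CLTensor &on_value, CLTensor &output)
    {
      CLOneHot onehot;
      // Scalar off_value overload: output is first filled with PixelValue() (zero)
      // by the memset kernel, then on_value is scattered along the last axis.
      onehot.configure(&indices, &on_value, &output, PixelValue(), /*depth*/ 10, /*axis*/ -1);
      onehot.run();
    }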
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
new file mode 100644
index 000000000..4d940e966
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+
+namespace arm_compute
+{
+CLPadLayerEx::CLPadLayerEx()
+ : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()),
+ _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false)
+{
+}
+
+void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value,
+ mode);
+}
+
+void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor *input,
+ ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(input->info(), output->info(), padding, constant_value, mode));
+
+ _perform_pad = std::any_of(padding.begin(), padding.end(),
+ [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
+
+ if (_perform_pad)
+ {
+ _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
+ }
+ else
+ {
+ Window copy_window = Window();
+ copy_window.use_tensor_dimensions(output->info()->tensor_shape());
+ // Copy the input to the whole output if no padding is applied
+ _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window);
+ }
+}
+Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) {
+ return info.first > 0 || info.second > 0;
+ });
+
+ if (perform_pad)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPadLayerKernelEx::validate(input, output, padding, constant_value, mode));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output));
+ }
+ return Status{};
+}
+void CLPadLayerEx::run()
+{
+ if (_perform_pad)
+ {
+ CLScheduler::get().enqueue(*_pad_kernel);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(*_copy_kernel);
+ }
+}
+} // namespace arm_compute
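A minimal usage sketch of CLPadLayerEx as added above; the padding amounts, tensor names and constant value are hypothetical, and input/output are assumed to be CL tensors whose shapes agree with the requested padding.

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

    using namespace arm_compute;

    void run_pad_sketch(CLTensor &input, CLTensor &output)
    {
      // Pad one element before/after dimension 0 and two before/after dimension 1,
      // filling the new elements with zeros (PixelValue() default-constructs to 0).
      const PaddingList padding = {{1, 1}, {2, 2}};
      CLPadLayerEx pad;
      pad.configure(&input, &output, padding, PixelValue(), PaddingMode::CONSTANT);
      pad.run();
    }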
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index b198e7330..6740835a8 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -40,21 +40,20 @@
#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
-#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
- _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+ : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+ _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
{
}
Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
const std::set<uint32_t> &axis, bool keep_dims,
- const ReduceOperation &op)
+ const ReductionOperation &op)
{
const size_t num_of_kernels = axis.size();
const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
@@ -62,7 +61,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1);
// Create temporary tensor infos
- auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+ auto interm_tensors = std::make_unique<TensorInfo[]>(num_of_interm_tensors);
// Create intermediate tensor info
TensorShape shape{input->tensor_shape()};
@@ -92,13 +91,13 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
for (size_t i = 0; i < num_of_kernels; ++i, ++it)
{
ARM_COMPUTE_RETURN_ON_ERROR(
- CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
}
if (!keep_dims)
{
ARM_COMPUTE_RETURN_ON_ERROR(
- CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+ CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
}
return Status{};
@@ -106,7 +105,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
const std::set<uint32_t> &axis, bool keep_dims,
- ReduceOperation op)
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
@@ -125,8 +124,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
throw std::runtime_error("CLReduceOperation: there is no axis to reduce");
}
- _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
- _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+ _interm_tensors = std::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels = std::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
// Set a vector that is ordered ICLTensors sequentially.
std::vector<ICLTensor *> tensors;
@@ -137,7 +136,7 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
}
tensors.emplace_back(output);
- // Apply ReduceOperation on all kernels
+ // Apply ReductionOperation on all kernels
TensorShape shape{input->info()->tensor_shape()};
auto it = axis.begin();
for (size_t i = 0; i < num_of_kernels; ++i, ++it)
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
new file mode 100644
index 000000000..bca4d5cb6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSplitVEx.h"
+#include "support/ToolchainSupport.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include <cassert>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ICLTensor *size_splits, const std::vector<ICLTensor *> &outputs,
+ unsigned int num_splits)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(size_splits->info()->num_dimensions() != 1,
+ "size_splits must be a 1-D tensor.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_splits != outputs.size(),
+ "Number of output tensors does not match number of splits.");
+ return Status{};
+}
+
+Status validate_slices(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs,
+ uint32_t split_dim)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(split_dim >= input->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for (unsigned int d = 0; d < input->num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+ unsigned int axis_offset = 0;
+ // Validate output tensors
+ for (const auto &output : outputs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+ // Get output shape
+ const TensorShape output_shape = output->tensor_shape();
+ ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+ const size_t axis_split_step = output_shape[split_dim];
+
+    // Output auto initialization if not yet initialized
+ TensorInfo tmp_output_info = *output->clone();
+ auto_init_if_empty(tmp_output_info,
+ input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+ // Update coordinate on axis
+ start_coords.set(split_dim, axis_offset);
+ end_coords.set(split_dim, axis_offset + axis_split_step);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords));
+
+ axis_offset += axis_split_step;
+ }
+
+ return Status{};
+}
+
+void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &outputs,
+ std::vector<CLSlice> &_slice_functions, uint32_t split_dim)
+{
+ unsigned int axis_offset = 0;
+ // Start/End coordinates
+ Coordinates start_coords;
+ Coordinates end_coords;
+ for (unsigned int d = 0; d < input->info()->num_dimensions(); ++d)
+ {
+ end_coords.set(d, -1);
+ }
+ int out_iter = 0;
+ for (const auto &output : outputs)
+ {
+ const TensorShape output_shape = output->info()->tensor_shape();
+ auto op_size = output_shape.total_size();
+ if (!op_size)
+ {
+ continue;
+ }
+
+ assert(op_size != 0);
+ assert(split_dim <= output_shape.num_dimensions());
+
+ const size_t axis_split_step = output_shape[split_dim];
+
+    // Output auto initialization if not yet initialized
+ TensorInfo tmp_output_info = *output->info()->clone();
+ auto_init_if_empty(
+ tmp_output_info,
+ input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+ // Update coordinate on axis
+ start_coords.set(split_dim, axis_offset);
+ end_coords.set(split_dim, axis_offset + axis_split_step);
+
+ // Configure slice function
+ _slice_functions[out_iter].configure(input, output, start_coords, end_coords);
+
+ // Set valid region from shape
+ outputs[out_iter++]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+ axis_offset += axis_split_step;
+ }
+}
+
+} // namespace
+
+CLSplitVEx::CLSplitVEx()
+ : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions()
+{
+}
+
+void CLSplitVEx::configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim,
+ const std::vector<ICLTensor *> &outputs, unsigned int num_splits)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, size_splits);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(size_splits, outputs, num_splits));
+
+ _input = input;
+ _size_splits = size_splits;
+ _outputs = outputs;
+ _num_splits = num_splits;
+
+ // Create tensor slices
+ _slice_functions.resize(_num_splits);
+
+ // Extract output tensor info
+ std::vector<ITensorInfo *> outputs_info;
+ for (auto &&output : _outputs)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ outputs_info.emplace_back(output->info());
+ }
+
+ // Validate slices
+ ARM_COMPUTE_ERROR_THROW_ON(validate_slices(_input->info(), outputs_info, split_dim));
+
+ // Configure slices
+ configure_slices(_input, _outputs, _slice_functions, split_dim);
+}
+
+void CLSplitVEx::run()
+{
+ // execute the slices
+ for (unsigned i = 0; i < _outputs.size(); ++i)
+ {
+ _slice_functions[i].run();
+ }
+}
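A minimal usage sketch of CLSplitVEx as added above; the split dimension, number of splits and tensor names are hypothetical, and size_splits is assumed to be a 1-D tensor whose entries match the output shapes along split_dim.

    #include <vector>
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSplitVEx.h"

    using namespace arm_compute;

    void run_splitv_sketch(CLTensor &input, CLTensor &size_splits,
                           CLTensor &out0, CLTensor &out1, CLTensor &out2)
    {
      std::vector<ICLTensor *> outputs = {&out0, &out1, &out2};
      CLSplitVEx splitv;
      // Each output receives a slice of 'input' along dimension 0, sized as
      // described by the corresponding entry of 'size_splits'.
      splitv.configure(&input, &size_splits, /*split_dim*/ 0, outputs, /*num_splits*/ 3);
      splitv.run();
    }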
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
index 3ac95a8e6..accd51302 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -49,14 +49,14 @@ namespace arm_compute
{
CLTopKV2::CLTopKV2()
- : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
- _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
- _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
- _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
- _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
- _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
- _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
- _reorder_negatives_kernel(), _store_kernel()*/
+ : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+ _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+ _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+ _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), _p_out_key_buf(nullptr),
+ _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
+ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+ _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+ _reorder_negatives_kernel(), _store_kernel()*/
{
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
index 3215d01a7..f3f093c18 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -53,7 +53,7 @@ using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_manager(std::move(memory_manager)), _function()
+ : _memory_manager(std::move(memory_manager)), _function()
{
}
@@ -79,7 +79,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC
{
case DeconvolutionMethod::DIRECT:
{
- auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
+ auto f = std::make_unique<CLDirectTransposeConvLayer>();
f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
invalid_bottom, weights_info);
_function = std::move(f);
@@ -87,7 +87,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC
}
case DeconvolutionMethod::GEMM:
{
- auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
f->configure(compile_context, input, weights, bias, output, deconv_info);
_function = std::move(f);
break;
@@ -105,20 +105,20 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
switch (CLTransposeConvLayer::get_deconvolution_method(
- input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
{
case DeconvolutionMethod::DIRECT:
{
// Validate direct convolution layer
ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate(
- input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
break;
}
case DeconvolutionMethod::GEMM:
{
// Validate gemm-based convolution layer
ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+ CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
break;
}
default:
@@ -130,9 +130,9 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
}
DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method(
- const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
- ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
- unsigned int invalid_bottom, const WeightsInfo &weights_info)
+ const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
+ ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
{
ARM_COMPUTE_UNUSED(output, bias, weights_info);
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
deleted file mode 100644
index 2fc94b267..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
-#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
-
-#include "arm_compute/core/ITensor.h"
-#include "support/MemorySupport.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-
-template <BinaryLogicalOperation COP>
-void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
- ITensor *output)
-{
- auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
- k->configure(COP, input1, input2, output);
- _kernel = std::move(k);
-}
-
-template <BinaryLogicalOperation COP>
-Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
- const ITensorInfo *input2,
- const ITensorInfo *output)
-{
- return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output);
-}
-
-void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
- BinaryLogicalOperation op)
-{
- auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
- k->configure(op, input1, input2, output);
- _kernel = std::move(k);
-}
-
-Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output, BinaryLogicalOperation op)
-{
- return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output);
-}
-
-// Supported Specializations
-template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
-template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
new file mode 100644
index 000000000..f6eec2603
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NECastBool.h"
+
+#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
+
+using namespace arm_compute;
+
+void NECastBool::configure(const ITensor *input, ITensor *output)
+{
+ auto k = std::make_unique<NECastBoolKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+Status NECastBool::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NECastBoolKernel::validate(input, output);
+}
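A minimal usage sketch of the NECastBool function added above; the tensor names and data types are hypothetical, and both tensors are assumed to be initialized and allocated before the call.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NECastBool.h"

    using namespace arm_compute;

    void run_castbool_sketch(Tensor &bool_input, Tensor &output)
    {
      // Check the input/output combination first, then configure and run.
      ARM_COMPUTE_ERROR_THROW_ON(NECastBool::validate(bool_input.info(), output.info()));
      NECastBool cast;
      cast.configure(&bool_input, &output);
      cast.run();
    }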
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
index e0ab3e025..99fc5c579 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -41,13 +41,12 @@
#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
{
- auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ auto k = std::make_unique<NEEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
index a123439d9..fbd88fff0 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -58,7 +58,7 @@ namespace
Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
ARM_COMPUTE_RETURN_ON_ERROR(
- NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
+ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
return Status{};
}
@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
- auto k = support::cpp14::make_unique<NETransposeKernel>();
+ auto k = std::make_unique<NETransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -78,11 +78,11 @@ Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in
}
NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
- std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
- _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
- _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
- _accumulate_biases(false), _is_prepared(false)
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
+ _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
+ _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
+ _accumulate_biases(false), _is_prepared(false)
{
}
@@ -103,8 +103,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
- input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- fc_info));
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
_are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
_accumulate_biases = false;
@@ -132,10 +132,10 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
bool _is_fc_after_conv;
if (is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
- (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ _is_fc_after_conv =
+ (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
}
else
{
@@ -150,23 +150,23 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
{
// Reshape the weights
_reshape_weights_output.allocator()->init(
- weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- compute_transposed_shape(*weights->info())));
+ weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights->info())));
_reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
weights_to_use = &_reshape_weights_output;
}
// Quantize input
_quantized_input.allocator()->init(
- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
- DataType::QASYMM8_SIGNED));
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
_scale_factor.allocator()->init(
- TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
+ TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
_quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
// GEMM
_gemmlowp_output.allocator()->init(
- output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);
// Multiply scale
@@ -195,8 +195,8 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
const ITensorInfo &reshaped_weights =
- TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- compute_transposed_shape(*weights)));
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
// Configure accumulate biases kernel for non quantized asymmetric types
if (biases != nullptr)
@@ -217,7 +217,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(
- NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+ NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
@@ -225,20 +225,19 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
// Validate quantization kernel
- const ITensorInfo &quantized_input =
- TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
- DataType::QASYMM8_SIGNED));
+ const ITensorInfo &quantized_input = TensorInfo(
+ input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED));
const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
ARM_COMPUTE_RETURN_ON_ERROR(
- NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
+ NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
const ITensorInfo &gemmlowp_output = TensorInfo(
- output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
// Validate matrix multiply kernel
ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
- &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale));
+ &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale));
return Status{};
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
index cb7557a5a..758f7dc59 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -50,7 +50,8 @@
#include <algorithm>
#include <cmath>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
namespace
@@ -69,14 +70,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
// Validate gemmlowp function
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
- &input.clone()->set_quantization_info(input_quantization_info),
- &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
+ &input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
- &input, &weights, nullptr, &output, 1.f, 0.0f,
- GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f,
+ GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
}
return Status{};
@@ -84,12 +85,12 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
} // namespace
NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
- _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
- _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
- _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
- _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
- _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+ : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
+ _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
+ _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(),
+ _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true),
+ _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false),
+ _is_quantized(false), _is_prepared(false)
{
}
@@ -105,9 +106,9 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *
const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
input->info()->set_quantization_info(QuantizationInfo(
- input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
weights->info()->set_quantization_info(QuantizationInfo(
- weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+ weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
// Configure gemmlowp function
_mm_gemmlowp.configure(input, weights, nullptr, output);
@@ -129,8 +130,8 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen
ITensor *output)
{
ARM_COMPUTE_ERROR_ON(
- (weights->info()->dimension(1) !=
- (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+ (weights->info()->dimension(1) !=
+ (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
// If the fully connected layer is called after a convolution layer, the input tensor must be
// linearized
@@ -138,8 +139,7 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen
// Initialize output tensor for flatten
TensorShape shape_flatten = compute_flatten_shape(input->info());
_flatten_output.allocator()->init(
- input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- shape_flatten));
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
// Configure flatten kernel
_memory_group.manage(&_flatten_output);
@@ -165,12 +165,11 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
const ITensor *biases, ITensor *output,
FullyConnectedLayerInfo fc_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
// Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
- input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
- fc_info));
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
_are_weights_converted = true;
_are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
@@ -183,8 +182,7 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
if (_is_quantized)
{
_gemmlowp_output.allocator()->init(
- output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
- DataType::S32));
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
}
// Configure accumulate biases kernel for non quantized asymmetric types
@@ -208,10 +206,10 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
if (is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
- (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
+ _is_fc_after_conv =
+ (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
}
else
{
@@ -284,16 +282,16 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const ITensorInfo &flatten_input =
- TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- compute_flatten_shape(input)));
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_flatten_shape(input)));
const ITensorInfo &reshaped_weights =
- TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
- compute_transposed_shape(*weights)));
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights =
- weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
- : TensorInfo(*reshaped_weights.clone());
+ weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
const ITensorInfo &gemmlowp_output = TensorInfo(
- output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
// Configure accumulate biases kernel for non quantized asymmetric types
if (biases != nullptr && !is_quantized)
@@ -330,7 +328,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(
- NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+ NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
@@ -338,7 +336,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
{
// Validate convert weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
- weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
weights_to_use = &converted_weights;
}
@@ -346,11 +344,11 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
{
// Fully Connected layer after a Convolution Layer without batches
ARM_COMPUTE_RETURN_ERROR_ON(
- (weights_to_use->dimension(1) !=
- (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+ (weights_to_use->dimension(1) !=
+ (input->dimension(0) * input->dimension(1) * input->dimension(2))));
// Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input));
input_to_use = &flatten_input;
}
else
@@ -365,7 +363,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
if (is_quantized)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
- &gemmlowp_output, biases, output));
+ &gemmlowp_output, biases, output));
}
return Status{};
@@ -376,9 +374,13 @@ void NEFullyConnectedLayerEx::run()
if (!_is_prepared)
{
if (!_are_weights_reshaped)
+ {
_reshape_weights_output.allocator()->allocate();
+ }
if (!_are_weights_converted)
+ {
_converted_weights_output.allocator()->allocate();
+ }
_is_prepared = true;
}
@@ -409,7 +411,7 @@ void NEFullyConnectedLayerEx::run()
// Linearize input if it comes from a convolutional layer
if (_is_fc_after_conv)
{
- NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+ _flatten_kernel.run();
}
// Run matrix multiply
@@ -492,3 +494,4 @@ void NEFullyConnectedLayerEx::prepare()
}
#endif
}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
index dc6c78478..2199839fb 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -19,6 +19,8 @@
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+#include "src/core/helpers/AutoConfiguration.h"
+#include <cassert>
using namespace arm_compute;
@@ -56,7 +58,7 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input
assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
bool is_hybrid = input->info()->data_type() == DataType::F32 &&
- (weights->info()->data_type() == DataType::S8 ||
+ (weights->info()->data_type() == DataType::QSYMM8 ||
weights->info()->data_type() == DataType::QASYMM8_SIGNED);
if (is_hybrid)
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
index 433c35d58..e5607ab9a 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -41,7 +41,6 @@
#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "support/MemorySupport.h"
#include <utility>
@@ -49,7 +48,7 @@ namespace arm_compute
{
void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
{
- auto k = support::cpp14::make_unique<NEGatherKernelEx>();
+ auto k = std::make_unique<NEGatherKernelEx>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
index 52d58accf..7cc6c89e7 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -41,14 +41,13 @@
#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
ITensor *output, ITensor *hits)
{
- auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
+ auto k = std::make_unique<NEHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
index 16d74e62d..451aa0997 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -46,9 +46,9 @@
namespace arm_compute
{
NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
- std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
- _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+ _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
{
}
@@ -88,8 +88,8 @@ Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const
float epsilon)
{
return NEInstanceNormalizationLayerKernelEx::validate(
- &input->clone()->set_data_layout(DataLayout::NCHW),
- &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+ &input->clone()->set_data_layout(DataLayout::NCHW),
+ &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
}
void NEInstanceNormalizationLayerEx::run()
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
index 2752eb6aa..e0620bad2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,30 +37,23 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayerEx.h"
-
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
-#include "arm_compute/runtime/IRuntimeContext.h"
-#include "support/MemorySupport.h"
-
+#include <utility>
namespace arm_compute
{
-NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT
- : INESimpleFunctionNoBorder(ctx)
+void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+ const ITensor *off_value, ITensor *output, int axis)
{
-}
-void NEActivationLayerEx::configure(ITensor *input, ITensor *output,
- ActivationLayerInfo activation_info)
-{
- auto k = support::cpp14::make_unique<NEActivationLayerKernelEx>();
- k->configure(input, output, activation_info);
+ auto k = std::make_unique<NEOneHotKernel>();
+ k->configure(indices, depth, on_value, off_value, output, axis);
_kernel = std::move(k);
}
-
-Status NEActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ActivationLayerInfo &act_info)
+Status NEOneHot::validate(const ITensorInfo *indices, const ITensorInfo *depth,
+ const ITensorInfo *on_value, const ITensorInfo *off_value,
+ const ITensorInfo *output, int axis)
{
- return NEActivationLayerKernelEx::validate(input, output, act_info);
+ return NEOneHotKernel::validate(indices, depth, on_value, off_value, output, axis);
}
} // namespace arm_compute
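A minimal usage sketch of NEOneHot as rewritten above; the tensor names and the axis value are hypothetical, and depth, on_value and off_value are assumed to be the scalar tensors the kernel expects, all initialized and allocated by the caller.

    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NEOneHot.h"

    using namespace arm_compute;

    void run_ne_onehot_sketch(Tensor &indices, Tensor &depth, Tensor &on_value,
                              Tensor &off_value, Tensor &output)
    {
      NEOneHot onehot;
      // One-hot encode 'indices' along the last axis of 'output'.
      onehot.configure(&indices, &depth, &on_value, &off_value, &output, /*axis*/ -1);
      onehot.run();
    }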
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
index aedb537e9..a30c00ea1 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -40,22 +40,24 @@
#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
- _reduction_ops(), _keep_dims()
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
{
}
Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
- bool keep_dims, const ITensorInfo *output, ReduceOperation op)
+ bool keep_dims, const ITensorInfo *output, ReductionOperation op)
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_UNUSED(op);
@@ -102,7 +104,7 @@ Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &
}
void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
- ITensor *output, ReduceOperation op)
+ ITensor *output, ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -125,7 +127,7 @@ void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_a
for (unsigned int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape =
- i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
index 26a887912..7a1342644 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -40,15 +40,19 @@
#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
- _reduction_ops(), _keep_dims()
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
{
}
@@ -122,7 +126,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b
for (unsigned int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape =
- i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
@@ -135,7 +139,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b
_reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
input->info()->data_type(),
input->info()->quantization_info())
- .set_data_layout(input->info()->data_layout()));
+ .set_data_layout(input->info()->data_layout()));
_memory_group.manage(&_reduced_outs[i]);
_reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
ReductionOperation::SUM);
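[Editor's note: both configure() loops above build a chain of intermediate tensors, each with one reduced axis collapsed to 1; the final reshape squeezes those axes only when keep_dims is false. A shape-only sketch of that bookkeeping, using plain C++ containers rather than the ACL TensorShape type:]

#include <cstddef>
#include <vector>

// Shape-only illustration of the chained reduction in the hunks above:
// each step collapses one axis to 1 (out_shape.set(axis_local[i], 1)),
// and a final squeeze drops those axes when keep_dims == false.
std::vector<size_t> reduce_shape(std::vector<size_t> shape,
                                 const std::vector<size_t> &axes, bool keep_dims)
{
  for (size_t axis : axes)
    shape[axis] = 1;

  if (keep_dims)
    return shape;

  std::vector<size_t> squeezed;
  for (size_t i = 0; i < shape.size(); ++i)
  {
    // Drop only the dimensions that were explicitly reduced.
    bool reduced = false;
    for (size_t axis : axes)
      reduced |= (axis == i);
    if (!reduced)
      squeezed.push_back(shape[i]);
  }
  return squeezed;
}

// Example: {2, 3, 4} reduced over axes {1} -> {2, 1, 4} with keep_dims,
//                                          -> {2, 4}    without.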
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
deleted file mode 100644
index 2aa0d2d4b..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Define dimension to split the window
- *
- * @param[in] axis Reduction axis
- *
- * @return The dimension to split the window
- */
-size_t reduction_window_split_dimension(unsigned int axis)
-{
- switch (axis)
- {
- case 0:
- return Window::DimY;
- case 1:
- case 2:
- case 3:
- return Window::DimX;
- default:
- ARM_COMPUTE_ERROR("Unsupported reduction axis");
- }
-}
-} // namespace
-
-NEReductionOperationEx::NEReductionOperationEx()
- : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
-{
-}
-
-Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- unsigned int axis, ReduceOperation op)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op));
-
- return Status{};
-}
-
-void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
- ReduceOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(
- NEReductionOperationEx::validate(input->info(), output->info(), axis, op));
-
- // Configure reduction kernel
- _reduction_kernel.configure(input, output, axis, op);
- _window_split = reduction_window_split_dimension(axis);
- _reduction_axis = axis;
-
- if (axis == 0)
- {
- // Configure fill border kernel
- const BorderSize fill_border_size = _reduction_kernel.border_size();
- PixelValue pixelValue;
- switch (op)
- {
- case ReduceOperation::MIN:
- {
- switch (input->info()->data_type())
- {
- case DataType::F32:
- {
- pixelValue = PixelValue(std::numeric_limits<float>::max());
- break;
- }
- case DataType::F16:
- {
- pixelValue = PixelValue(static_cast<half>(65504.0f));
- break;
- }
- case DataType::QASYMM8:
- {
- pixelValue =
- PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported DataType");
- }
- }
- break;
- }
- case ReduceOperation::MAX:
- {
- switch (input->info()->data_type())
- {
- case DataType::F32:
- {
- pixelValue = PixelValue(-std::numeric_limits<float>::max());
- break;
- }
- case DataType::F16:
- {
- pixelValue = PixelValue(static_cast<half>(-65504.0f));
- break;
- }
- case DataType::QASYMM8:
- {
- pixelValue =
- PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported DataType");
- }
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Reduction Operation unsupported");
- }
- _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
- }
-}
-
-void NEReductionOperationEx::run()
-{
- if (_reduction_axis == 0)
- {
- NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
- }
- NEScheduler::get().schedule(&_reduction_kernel, _window_split);
-}
-} // namespace arm_compute
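[Editor's note: the deleted file above filled the border, in the axis-0 case, with the identity element of the reduction so padded lanes can never win: +max for MIN, -max for MAX, and the quantized extremes 255/0 for QASYMM8. A standalone sketch of that selection for the float case; this helper is hypothetical and not part of the library:]

#include <limits>

// Hypothetical helper mirroring the removed fill-border logic: the constant
// border value is the identity of the reduction, so border pixels never
// affect a MIN/MAX result. Float-only for brevity; the original also
// handled F16 (+/-65504) and QASYMM8 (255 / 0).
enum class ReduceKind { MIN, MAX };

float border_identity(ReduceKind op)
{
  switch (op)
  {
    case ReduceKind::MIN:
      return std::numeric_limits<float>::max();   // can never be the minimum
    case ReduceKind::MAX:
      return -std::numeric_limits<float>::max();  // can never be the maximum
  }
  return 0.0f; // unreachable
}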
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
index aa165cc15..4675121b2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -44,6 +44,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute::misc::shape_calculator;
@@ -51,17 +52,9 @@ namespace arm_compute
{
NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _conv_f(),
- _upsample_f(),
- _flip_weights(),
- _scaled_output(),
- _weights_flipped(),
- _flip_axis(),
- _original_weights(nullptr),
- _input(nullptr),
- _info(),
- _is_prepared(false)
+ : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(),
+ _scaled_output(), _weights_flipped(), _flip_axis(), _original_weights(nullptr), _input(nullptr),
+ _info(), _is_prepared(false)
{
}
@@ -76,15 +69,15 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
const unsigned int width_idx =
- get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
const unsigned int height_idx =
- get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
auto out_dims = transposeconv_output_dimensions(
- input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
- weights->dimension(height_idx), info, invalid_right, invalid_bottom);
+ input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
+ weights->dimension(height_idx), info, invalid_right, invalid_bottom);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
if (bias != nullptr)
@@ -117,24 +110,24 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
unsigned int pad_right = 0;
unsigned int pad_top = 0;
unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
- pad_bottom);
+ const TensorShape scale_out_shape =
+ compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right,
+ invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
TensorInfo scale_out_info(
- input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+ input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
const unsigned int batches_idx =
- get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
const unsigned int channel_idx =
- get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) !=
scale_out_info.dimension(batches_idx));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) !=
scale_out_info.dimension(channel_idx));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output,
- conv_info, WeightsInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
return Status{};
}
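[Editor's note: the validate() hunk above sizes the intermediate tensor by upsampling the input and then running a stride-1, zero-pad convolution. For orientation only, the textbook transposed-convolution size relation is sketched below; it is an assumption for illustration and does not model the invalid_right/invalid_bottom trimming that the Ex shape calculators apply.]

// Textbook transposed-convolution output size, shown only to motivate the
// upsample-then-convolve structure configured above; not the library's
// transposeconv_output_dimensions() implementation.
unsigned int transposed_conv_out_dim(unsigned int in, unsigned int kernel,
                                     unsigned int stride, unsigned int pad)
{
  // out = (in - 1) * stride + kernel - 2 * pad
  return (in - 1) * stride + kernel - 2 * pad;
}

// Example: in = 4, kernel = 3, stride = 2, pad = 0 -> out = 9.
// The upsampled tensor is then consumed by NEConvolutionLayer with the
// 1x1-stride, CEIL-rounded PadStrideInfo from the hunk above.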
@@ -146,21 +139,21 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
- input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
- info, invalid_right, invalid_bottom));
+ input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
auto out_dims = transposeconv_output_dimensions(
- input->info()->dimension(width_idx), input->info()->dimension(height_idx),
- weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
- invalid_right, invalid_bottom);
+ input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
+ invalid_right, invalid_bottom);
const TensorShape output_shape =
- compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
_input = input;
_original_weights = weights;
@@ -188,8 +181,8 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
- pad_right, pad_top, pad_bottom);
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
DimensionRoundingType::FLOOR);