path: root/compute/ARMComputeEx
author    Chunseok Lee <chunseok.lee@samsung.com>  2021-04-20 18:01:41 +0900
committer Chunseok Lee <chunseok.lee@samsung.com>  2021-04-20 18:01:41 +0900
commit    589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e (patch)
tree      47a2b23ce4220e3a4150c8b12ed941555272fb0c /compute/ARMComputeEx
parent    62529acabbafce7730601ed01d5709d7bc0d378a (diff)
Diffstat (limited to 'compute/ARMComputeEx')
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h | 101
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h | 107
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h | 124
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h | 12
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 92
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/UtilsEx.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h | 1
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h | 5
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h | 1
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h | 8
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h | 29
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h | 5
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h | 3
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h | 130
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h | 5
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h | 1
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h | 7
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h | 14
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h | 1
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h | 3
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h | 2
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h | 6
-rw-r--r-- compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 32
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h | 96
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl | 7210
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h | 1235
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl | 2733
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl | 88
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl | 346
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h | 223
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl | 102
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp | 10
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp | 5
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp | 133
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp | 8
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp | 292
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp | 8
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp | 13
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp | 11
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp | 190
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp | 5
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp | 11
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp | 8
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp | 9
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp | 15
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp | 5
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp | 171
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp | 110
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp | 7
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp | 15
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp | 8
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp | 1
122 files changed, 13785 insertions(+), 166 deletions(-)
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
index 4a3717885..d3e116381 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -264,5 +264,5 @@ private:
_program_source_map; /**< Contains sources for all programs.
Used for compile-time kernel inclusion. >*/
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
index a0aa0560b..46d4ae858 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
@@ -40,7 +40,7 @@
#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
index bb6fcb8f5..eac866b67 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
@@ -41,8 +41,8 @@
#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
index ed668fd9c..cf671102e 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+#include "src/core/CL/ICLSimple3DKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
index fb689f747..6729fb0f1 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 000000000..64908ab59
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H
+#define ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface to add a bias to each row of the input tensor
+ *
+ */
+class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel &
+ operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types
+ * supported: Same as @p input
+ */
+ void configure(ICLTensor *accum, const ICLTensor *biases);
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data
+ * types supported: Same as @p input
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *accum,
+ const ICLTensor *biases);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLGEMMMatrixAccumulateBiasesKernel
+ *
+ * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types
+ * supported: Same as @p input
+ * @param[in] gpu_target GPU target
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_accum;
+ const ICLTensor *_biases;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H */
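Editor's note: the header above only declares the new CL bias-accumulation kernel, so a minimal usage sketch may help. It is not part of the commit; the tensor shapes and the function name accumulate_biases_example are illustrative, and the setup follows the usual ACL pattern of CLScheduler initialization and CLTensor allocation.

#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void accumulate_biases_example()
{
  CLScheduler::get().default_init(); // create the CL context and command queue once

  // Illustrative shapes: a 16x4 F32 accumulator and a 1D bias vector of length 16.
  CLTensor accum, biases;
  accum.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
  biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
  accum.allocator()->allocate();
  biases.allocator()->allocate();

  // Check the configuration, then add the biases to each row of accum in-place.
  ARM_COMPUTE_ERROR_THROW_ON(CLGEMMMatrixAccumulateBiasesKernel::validate(
    accum.info(), biases.info(), CLScheduler::get().target()));
  CLGEMMMatrixAccumulateBiasesKernel kernel;
  kernel.configure(&accum, &biases);
  CLScheduler::get().enqueue(kernel); // submit to the CL queue
  CLScheduler::get().sync();          // wait for completion
}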
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
index 6630c7be7..a55f2401d 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__
#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
index 96f830898..f9d6f7cc5 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
index f57e799ad..7da9e9a4c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h
new file mode 100644
index 000000000..4befdd05c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H
+#define ARM_COMPUTE_CLMEMSETKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for filling the planes of a tensor */
+class CLMemsetKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMemsetKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMemsetKernel(const CLMemsetKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMemsetKernel &operator=(const CLMemsetKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMemsetKernel(CLMemsetKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMemsetKernel &operator=(CLMemsetKernel &&) = default;
+ /** Default destructor */
+ ~CLMemsetKernel() = default;
+
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default
+ * is nullptr.
+ */
+ void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default
+ * is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *tensor,
+ const PixelValue &constant_value, Window *window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLMemsetKernel
+ *
+ * @param[in] tensor Source tensor info. Data types supported: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default is
+ * nullptr.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value,
+ Window *window = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_tensor;
+ Window _full_window;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMEMSETKERNEL_H */
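Editor's note: a short sketch of how the CLMemsetKernel declared above can be driven directly; it is not part of the commit, and the 8x8 F32 tensor and the function name memset_example are illustrative. Passing no window (the default nullptr) fills the whole tensor.

#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void memset_example()
{
  CLScheduler::get().default_init();

  // Illustrative 8x8 F32 tensor to be zero-filled.
  CLTensor tensor;
  tensor.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
  tensor.allocator()->allocate();

  CLMemsetKernel memset_kernel;
  memset_kernel.configure(&tensor, PixelValue(0.f)); // no window: fill every plane
  CLScheduler::get().enqueue(memset_kernel);
  CLScheduler::get().sync();
}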
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
index 90e8b5705..5394a062c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
#define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
index fa383c0d0..384050aff 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
#define __ARM_COMPUTE_CLNEGKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
index a512057b9..1d64f9f7d 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
@@ -39,7 +39,7 @@
*/
#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__
#define __ARM_COMPUTE_CLONEHOTKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h
new file mode 100644
index 000000000..d4230aaf3
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYERKERNELEX_H
+#define ARM_COMPUTE_CLPADLAYERKERNELEX_H
+
+#include "src/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the PadLayer function. */
+class CLPadLayerKernelEx : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPadLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernelEx(const CLPadLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernelEx &operator=(const CLPadLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernelEx(CLPadLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernelEx &operator=(CLPadLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~CLPadLayerKernelEx() = default;
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data types supported: U8, S8, QASYMM8,
+ * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Set the input and output tensor.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The
+ * pair padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPadLayerKernelEx
+ *
+ * @param[in] input Source tensor info. Data types supported: U8, S8, QASYMM8,
+ * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+ * @param[in] output Output tensor info. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ int _input_start_x;
+ int _input_start_y;
+ bool _4d_enabled;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYERKERNELEX_H */
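Editor's note: higher-level code would normally go through the CLPadLayerEx runtime function added later in this commit; the sketch below drives the kernel directly and is not part of the commit. The 4x4 input, the 6x6 output, and the function name pad_layer_kernel_example are illustrative.

#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void pad_layer_kernel_example()
{
  CLScheduler::get().default_init();

  // Pad a 4x4 F32 tensor by one element on each side of x and y -> 6x6 output.
  CLTensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(6U, 6U), 1, DataType::F32));
  input.allocator()->allocate();
  output.allocator()->allocate();

  const PaddingList padding = {{1, 1}, {1, 1}}; // (front, back) padding per dimension

  CLPadLayerKernelEx pad_kernel;
  pad_kernel.configure(&input, &output, padding, PixelValue(0.f), PaddingMode::CONSTANT);
  CLScheduler::get().enqueue(pad_kernel);
  CLScheduler::get().sync();
}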
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
index 4e1b56cba..3f60db7bb 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
#define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
index 4f9042e41..548f29a27 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
index 4d4478ece..5f5b7f9b8 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
#define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
index aa4a14812..09073af7c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
// these parameters can be changed
#define _ITEMS 16 // number of items in a group
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
index 8c544cda8..c46b26170 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
@@ -41,15 +41,19 @@
#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/cpu/kernels/CpuElementwiseKernel.h"
+
namespace arm_compute
{
-class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel
+class NEBinaryLogicalOperationKernel : public cpu::kernels::CpuComparisonKernel
{
public:
+ const char *name() const override { return "NEBinaryLogicalOperationKernel"; }
+
+ NEBinaryLogicalOperationKernel() = default;
/** Default destructor */
~NEBinaryLogicalOperationKernel() = default;
@@ -81,6 +85,10 @@ protected:
// Inherited methods overridden:
static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
const ITensorInfo &output);
+
+ std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output,
+ const Window &window)>
+ _function;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
index 101f6ac8e..036d56e69 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
@@ -40,7 +40,7 @@
#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__
#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
index 88f21c96e..621500eb8 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 000000000..f8f7ac567
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
+#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to add a bias to each row of the input tensor */
+class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEGEMMMatrixAccumulateBiasesKernel"; }
+ /** Default constructor */
+ NEGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixAccumulateBiasesKernel &
+ operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Default destructor */
+ ~NEGEMMMatrixAccumulateBiasesKernel() = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type
+ * supported: Same as @p input
+ */
+ void configure(ITensor *accum, const ITensor *biases);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGEMMMatrixAccumulateBiasesKernel
+ *
+ * @param[in] accum The accumulate tensor to convert. Data type supported: F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type
+ * supported: Same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *accum, const ITensorInfo *biases);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ ITensor *_accum;
+ const ITensor *_biases;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */
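Editor's note: a minimal sketch of the NEON counterpart declared above, again not part of the commit; the shapes and the function name ne_accumulate_biases_example are illustrative, and the kernel is scheduled with the usual NEScheduler pattern, splitting work along the row dimension.

#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void ne_accumulate_biases_example()
{
  // Illustrative shapes: a 16x4 F32 accumulator and a 1D bias vector of length 16.
  Tensor accum, biases;
  accum.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
  biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
  accum.allocator()->allocate();
  biases.allocator()->allocate();

  NEGEMMMatrixAccumulateBiasesKernel kernel;
  kernel.configure(&accum, &biases);                   // add biases to each row of accum in-place
  NEScheduler::get().schedule(&kernel, Window::DimY);  // distribute rows across CPU threads
}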
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
index 5acfde5a8..a03e08ade 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
index cb2a485d5..fb3a72725 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
index 8724cc69b..1d786b59e 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
index 198b0be9d..ab534fe96 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
index 963d7b821..c1c9f7a3c 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
@@ -39,7 +39,7 @@
*/
#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__
#define __ARM_COMPUTE_NEONEHOTKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
index 0b080cf73..1fd5362ae 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
index d57e8fcf5..d7ec1b4f0 100644
--- a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
@@ -67,5 +67,5 @@ transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
unsigned int kernel_width, unsigned int kernel_height,
const PadStrideInfo &info, unsigned int invalid_right,
unsigned int invalid_top);
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_UTILSEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index 484ebfd0b..664b8b3b1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -26,6 +26,7 @@
#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
#include <arm_compute/runtime/CL/functions/CLNeg.h>
#include <arm_compute/runtime/CL/functions/CLOneHot.h>
+#include <arm_compute/runtime/CL/functions/CLPadLayerEx.h>
#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
#include <arm_compute/runtime/CL/functions/CLSplitVEx.h>
#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
index b1ee52bf9..05bcc4075 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
@@ -41,8 +41,9 @@
#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -100,7 +101,7 @@ private:
std::vector<CLTensor> _results_vector;
CLTensor _not_reshaped_output;
std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector;
- CLReshapeLayerKernel _reshape_kernel;
+ CLReshapeLayer _reshape_kernel;
unsigned int _num_of_stages;
unsigned int _reduction_axis;
};
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
index 88a9b00ec..fc4322798 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -43,6 +43,7 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
index d6150684a..854ddce52 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
@@ -67,5 +67,5 @@ public:
*/
void configure(ICLTensor *input, ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCASTBOOL_H */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
index fbee7e40e..b0149cb09 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -73,5 +73,5 @@ public:
*/
void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
index f3266f688..c75ae9a50 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
@@ -43,14 +43,14 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
namespace arm_compute
{
@@ -182,5 +182,5 @@ private:
bool _is_prepared;
const ICLTensor *_original_weights;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
index f27e9913e..c08da526a 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
@@ -43,16 +43,14 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
namespace arm_compute
{
@@ -132,9 +130,6 @@ private:
* transpose_weights is set to true ) (called once)
* -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized
* asymmetric)
- * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref
- * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
- * not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -157,40 +152,36 @@ public:
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor. The weights must be 2 dimensional.
* If this function is called after a Convolution Layer, the (transposed)
- * weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed)
- * weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
+ * weights will have as many rows as the product of the first 3 input's dimensions. If it is
+ * called after another FullyConnected Layer, the (transposed) weights will have as many rows as
+ * the input's first dimension. Data type supported: Same as @p input.
* @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input.
* @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
* multiplication between:
* - The output of im2col on the input and the (transposed) 2D weights, if the
* function is called after a Convolution Layer
* - The input tensor and the (transposed) 2D weights, if the function is
- * called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
+ * called after another FullyConnected Layer. Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*/
void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref
- * CLFullyConnectedLayerEx
+ * CLFullyConnectedLayer
*
* @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor info. The weights must be 2 dimensional.
* If this function is called after a Convolution Layer, the (transposed)
- * weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed)
- * weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
+ * weights will have as many rows as the product of the first 3 input's dimensions. If it is
+ * called after another FullyConnected Layer, the (transposed) weights will have as many rows as
+ * the input's first dimension. Data type supported: Same as @p input.
* @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input.
* @param[out] output Destination tensor info. Its shape should be equal to the output of a
* matrix multiplication between:
* - The output of im2col on the input and the (transposed) 2D weights, if the
* function is called after a Convolution Layer
* - The input tensor and the (transposed) 2D weights, if the function is
- * called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
+ * called after another FullyConnected Layer. Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*
* @return a status
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
index 167554c9e..385eb0b2c 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
@@ -47,11 +47,14 @@
#ifndef __ARM_COMPUTE_CLGATHEREX_H__
#define __ARM_COMPUTE_CLGATHEREX_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/**
 * @brief Class to run @ref CLGatherKernel.
@@ -81,5 +84,5 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
const ITensorInfo *output, int axis = 0);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
index 6618f5aa4..5e172a4c7 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -78,5 +78,5 @@ public:
void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput,
ICLTensor *output, ICLTensor *hits);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
index 887e7aaa5..02ae6d719 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
@@ -41,11 +41,14 @@
#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to perform an Instance normalization.
*
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
index 2bbfca821..62a36f06d 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
@@ -39,9 +39,11 @@
*/
#ifndef __ARM_COMPUTE_CLONEHOT_H__
#define __ARM_COMPUTE_CLONEHOT_H__
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/runtime/IFunction.h"
+
namespace arm_compute
{
class ICLTensor;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
new file mode 100644
index 000000000..ee1879aaa
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYEREX_H
+#define ARM_COMPUTE_CLPADLAYEREX_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+#include "src/core/gpu/cl/kernels/ClCopyKernel.h"
+// #include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels:
+ *
+ * -# @ref CLPadLayerKernelEx if there is padding to be added
+ * -# @ref CLCopyKernel otherwise
+ */
+class CLPadLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLPadLayerEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerEx(const CLPadLayerEx &) = delete;
+ /** Default move constructor */
+ CLPadLayerEx(CLPadLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerEx &operator=(const CLPadLayerEx &) = delete;
+ /** Default move assignment operator */
+ CLPadLayerEx &operator=(CLPadLayerEx &&) = default;
+
+ /** Initialize the function
+ *
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The
+ * pair padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPadLayerEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void configure_reflect_mode(ICLTensor *input, ICLTensor *output);
+
+ std::unique_ptr<CLPadLayerKernelEx> _pad_kernel;
+ std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel;
+ bool _perform_pad;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYEREX_H */
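
A hedged usage sketch for the function declared above (tensor names, shapes, and the zero-pad amounts are illustrative assumptions, not taken from this file):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

int main()
{
  using namespace arm_compute;
  CLScheduler::get().default_init();

  // Pad a 4x4 F32 tensor with one element of zero padding on each side of both
  // dimensions, giving a 6x6 output.
  CLTensor src, dst;
  src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
  dst.allocator()->init(TensorInfo(TensorShape(6U, 6U), 1, DataType::F32));

  PaddingList padding = {{1, 1}, {1, 1}}; // padding[i] = {front, back} for dimension i

  CLPadLayerEx pad;
  // Defaults apply: constant_value = PixelValue() (zero), mode = PaddingMode::CONSTANT.
  pad.configure(&src, &dst, padding);

  src.allocator()->allocate();
  dst.allocator()->allocate();
  pad.run();
  CLScheduler::get().sync();
  return 0;
}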
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
index bb852e404..45eb72bef 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -116,5 +116,5 @@ private:
std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
CLReshapeLayer _reshape;
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
index bb741d98d..3023df3f0 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
@@ -46,6 +46,9 @@
#include <vector>
#include <memory>
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
+
namespace arm_compute
{
class ICLTensor;
@@ -82,5 +85,5 @@ private:
unsigned int _num_splits;
std::vector<CLSlice> _slice_functions;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLSPLITVEX__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
index e301a5152..f426a4d75 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -160,5 +160,5 @@ private:
CLTopKV2Store _store_kernel;
#endif
};
-}
+} // namespace arm_compute
#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
index efc296d6c..d0ddc2609 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -16,7 +16,6 @@
#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__
#define __ARM_COMPUTE_NEFUNCTIONSEX_H__
-#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
#include <arm_compute/runtime/NEON/functions/NECastBool.h>
#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
index 026d30098..8d931f08d 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
@@ -41,8 +41,10 @@
#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/TypesEx.h"
#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/core/ITensorInfo.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
index c8b08af8d..dd62645ee 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
@@ -41,16 +41,17 @@
#define __ARM_COMPUTE_NECASTBOOL_H__
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
- * @brief Class to run @ref NECastBoolKernel.
+ * @brief Class to run @ref INESimpleFunctionNoBorder.
*/
-class NECastBool : public INESimpleFunction
+class NECastBool : public INESimpleFunctionNoBorder
{
public:
/** Initialize the function's source, destination
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
index 63f7714aa..82a789e86 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
@@ -48,12 +48,14 @@
#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/core/Error.h"
#include <vector>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
* @brief Class to perform EmbeddingLookup operation
@@ -84,5 +86,5 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *output,
const ITensorInfo *lookups);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
index 56548a479..214592710 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -44,11 +44,11 @@
#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
index 8f98f220a..2bbb1fea1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
@@ -43,16 +43,16 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
namespace arm_compute
{
@@ -79,11 +79,11 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete;
/** Default move constructor */
- NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default;
+ NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete;
/** Default move assignment operator */
- NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default;
+ NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = delete;
/** Set the input and output tensors.
*
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
@@ -141,7 +141,7 @@ private:
void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
MemoryGroup _memory_group;
- NEFlattenLayerKernel _flatten_kernel;
+ NEFlattenLayer _flatten_kernel;
NEConvertFullyConnectedWeights _convert_weights;
NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
NEGEMM _mm_gemm;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
index 155a1b837..6944c77f6 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -47,6 +47,7 @@
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEGatherKernelEx */
class NEGatherEx : public INESimpleFunctionNoBorder
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
index 521a05ad9..f6fda60a9 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
@@ -48,12 +48,14 @@
#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/core/Error.h"
#include <vector>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
* @brief Class to perform HashtableLookup operation
@@ -96,5 +98,5 @@ public:
const ITensorInfo *input, const ITensorInfo *output,
const ITensorInfo *hits);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
index 18e813923..0ee967698 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
@@ -54,6 +54,7 @@
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to perform an Instance normalization.
*
@@ -112,5 +113,5 @@ private:
Tensor _permuted_input;
Tensor _permuted_output;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
index 1a68f801a..668f024a1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
@@ -45,6 +45,8 @@ namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
+
/** Basic function to run @ref NEOneHotKernel */
class NEOneHot : public INESimpleFunctionNoBorder
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
index 91eec815c..9858e6c09 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
@@ -43,7 +43,7 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
index 48b416923..f34a8f8af 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
@@ -43,11 +43,13 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
index 7a08dae97..f82579a45 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -102,9 +102,9 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
/** Allow instances of this class to be moved */
- NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ NETransposeConvLayer(NETransposeConvLayer &&) = delete;
/** Allow instances of this class to be moved */
- NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = delete;
/** Default destructor */
virtual ~NETransposeConvLayer() = default;
@@ -171,5 +171,5 @@ private:
PadStrideInfo _info;
bool _is_prepared;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 1a8ff3e71..1a180a35b 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -66,12 +66,16 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"gather_ex_1d", "gather_ex.cl"},
{"gather_ex_1d_out", "gather_ex.cl"},
{"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
+ {"gemm_accumulate_biases", "gemm.cl"},
{"hashtable_lookup", "hashtable_lookup.cl"},
{"instance_normalization_ex", "instance_normalization_ex.cl"},
+ {"memset", "memset.cl"},
{"multiply_scale_factor", "multiply_scale_factor.cl"},
{"neg_tensor", "neg_tensor.cl"},
{"one_hot", "one_hot.cl"},
{"one_hot_only_on_value", "one_hot.cl"},
+ {"pad_layer_constant", "pad_layer.cl"},
+ {"pad_layer_symmetric_reflect", "pad_layer.cl"},
{"quantization_symm8", "quantization_symm8.cl"},
{"reduce_min_max", "reduce_operation.cl"},
{"reduce_sum_mean", "reduce_operation.cl"},
@@ -90,10 +94,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
+ "activation_float_helpers.h",
+#include "./cl_kernels/activation_float_helpers.hembed"
+ },
+ {
"arg_min_max_ex.cl",
#include "./cl_kernels/arg_min_max_ex.clembed"
},
{
+ "binary_logical_op.cl",
+#include "./cl_kernels/binary_logical_op.clembed"
+ },
+ {
"cast.cl",
#include "./cl_kernels/cast.clembed"
},
@@ -110,6 +122,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/gemmlowp_ex.clembed"
},
{
+ "gemm_helpers.h",
+#include "./cl_kernels/gemm_helpers.hembed"
+ },
+ {
"hashtable_lookup.cl",
#include "./cl_kernels/hashtable_lookup.clembed"
},
@@ -126,8 +142,12 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/instance_normalization_ex.clembed"
},
{
- "binary_logical_op.cl",
-#include "./cl_kernels/binary_logical_op.clembed"
+ "gemm.cl",
+#include "./cl_kernels/gemm.clembed"
+ },
+ {
+ "memset.cl",
+#include "./cl_kernels/memset.clembed"
},
{
"multiply_scale_factor.cl",
@@ -142,6 +162,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/one_hot.clembed"
},
{
+ "pad_layer.cl",
+#include "./cl_kernels/pad_layer.clembed"
+ },
+ {
"quantization_symm8.cl",
#include "./cl_kernels/quantization_symm8.clembed"
},
@@ -150,6 +174,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/reduce_operation.clembed"
},
{
+ "repeat.h",
+#include "./cl_kernels/repeat.hembed"
+ },
+ {
"scale_factor.cl",
#include "./cl_kernels/scale_factor.clembed"
},
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h
new file mode 100644
index 000000000..3c3ff8419
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+
+// Hard-Swish
+#define hard_swish_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
+
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL)
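
For reference, the scalar math implemented by a few of the macros above, restated as host-side C++ equivalents (the kernels themselves use the OpenCL built-ins; these functions only document the formulas):

#include <algorithm>
#include <cmath>

// Host-side restatements of some of the activation macros above.
inline float hard_swish(float x) { return x * (std::min(std::max(x + 3.0f, 0.0f), 6.0f) * 0.166666667f); }
inline float logistic(float x) { return 1.0f / (1.0f + std::exp(-x)); }
inline float lu_brelu(float x, float a_val, float b_val) { return std::min(std::max(x, b_val), a_val); }
inline float leaky_relu(float x, float a_val) { return std::min(x, 0.0f) * a_val + std::max(x, 0.0f); }
inline float soft_relu(float x) { return std::log(1.0f + std::exp(x)); }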
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl
new file mode 100644
index 000000000..9b826a2bd
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl
@@ -0,0 +1,7210 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+ ({ \
+ a = select( \
+ 0, a, \
+ CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), \
+ VEC_DATA_TYPE(DATA_TYPE, K0))); \
+ })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) ({})
+#endif // (SRC_WIDTH % K0)
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks
+ * of size M0xK0 and stores each one (not transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g.
+ * -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g.
+ * -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer
+ * 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) +
+ ((y / (uint)V0) * (uint)dst_stride_y) +
+ ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+  // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+ // Load values from the LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+ // ---------------------------Store output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
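
The compile-time parameters this kernel reads (M0, K0, V0, DATA_TYPE, SRC_WIDTH, plus the optional INTERLEAVE and REINTERPRET_INPUT_AS_3D switches) are supplied as -D options when the host compiles the program. A hedged sketch of such an option string follows; the values are made up, and in practice the corresponding kernel class is expected to assemble them from the tensor shapes:

#include <string>

// Illustrative only: the values below are examples, not defaults taken from this patch.
std::string gemm_reshape_lhs_nt_build_options()
{
  std::string opts;
  opts += "-DDATA_TYPE=float ";
  opts += "-DM0=4 ";          // rows per block
  opts += "-DK0=4 ";          // columns per block
  opts += "-DV0=2 ";          // number of M0xK0 blocks stored on the same output row
  opts += "-DSRC_WIDTH=128 "; // width of the LHS matrix, used for the boundary condition
  // opts += "-DINTERLEAVE ";  // optional: interleave the M0xK0 blocks
  // opts += "-DREINTERPRET_INPUT_AS_3D -DHEIGHT_GEMM3D=28 -DDEPTH_GEMM3D=16"; // optional 3D input
  return opts;
}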
+
+#if M0 == 2
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 3 // M0 == 3
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 4 // M0 == 4
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 5 // M0 == 5
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ DATA_TYPE res1 = a4.s##i; \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
+ })
+#elif M0 == 6 // M0 == 6
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(2) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 7 // M0 == 7
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(3) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 8 // M0 == 8
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, \
+ a6.s##i, a7.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#else // M0 not supported
+#error "M0 value not supported"
+#endif // M0 conditions
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks
+ * of size M0xK0 and stores each one (transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g.
+ * -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g.
+ * -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer
+ * 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) +
+ ((y / (uint)V0) * (uint)dst_stride_y) +
+ ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+  // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ // Load values from the LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+ // ---------------------------Transpose and store block -----------------------
+
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
+#if K0 > 2
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
+#endif // K0 > 2
+#if K0 > 3
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
+#endif // K0 > 3
+#if K0 > 4
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
+#endif // K0 > 4
+#if K0 > 8
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
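
A scalar model of what the transposed variant above does to a single M0xK0 block, ignoring V0 interleaving and the batched/3D offsets (sizes and values are illustrative): each column of the block is emitted contiguously, so a row-major 2x2 block a00 a01 a10 a11 is written out as a00 a10 a01 a11.

#include <array>
#include <cstdio>

int main()
{
  constexpr int M0 = 2, K0 = 2; // small block purely for illustration
  // Row-major M0xK0 block: a00 a01 / a10 a11
  std::array<float, M0 * K0> block = {0.f, 1.f, 2.f, 3.f};
  std::array<float, M0 * K0> reshaped{};
  for (int k = 0; k < K0; ++k)   // for each column of the block...
    for (int m = 0; m < M0; ++m) // ...store its M0 elements contiguously
      reshaped[k * M0 + m] = block[m * K0 + k];
  for (float v : reshaped)
    std::printf("%g ", v); // prints: 0 2 1 3
  return 0;
}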
+
+#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks
+ * of size K0xN0 and stores each one (not transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g.
+ * -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g.
+ * -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 1,2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y +
+ z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) +
+ ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) +
+ ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a,
+                           0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+#if K0 > 1
+ if (y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#endif // K0 > 1
+#if K0 > 2
+ if (y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if (y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if (y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if (y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if (y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if (y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if (y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if (y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if (y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if (y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if (y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if (y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if (y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if (y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Store output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#if defined(TRANSPOSE)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks
+ * of size K0xN0 and stores each one (transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g.
+ * -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g.
+ * -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile
+ * time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X ((K0) * (H0))
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y +
+ z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) +
+ ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) +
+ ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+ REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ if (y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#if K0 > 2
+ if (y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if (y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if (y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if (y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if (y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if (y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if (y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if (y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if (y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if (y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if (y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if (y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if (y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if (y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Transpose the block ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(
+ N0, VEC_DATA_TYPE(DATA_TYPE, K0), res,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
+
+#if K0 == 2
+ // This part computes the following transpositions:
+ // 2x2 -> 2x2
+ // 2x4 -> 4x2
+ // 2x8 -> 8x2
+ // 2x16 -> 16x2
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
+#endif // N0 > 8
+
+#elif K0 == 3 // K0 == 3
+ // This part computes the following transpositions:
+ // 3x2 -> 2x3
+ // 3x4 -> 4x3
+ // 3x8 -> 8x3
+ // 3x16 -> 16x3
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
+#endif // N0 > 8
+
+#elif K0 == 4 // K0 == 4
+ // This part computes the following transpositions:
+ // 4x2 -> 2x4
+ // 4x4 -> 4x4
+ // 4x8 -> 8x4
+ // 4x16 -> 16x4
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
+#endif // N0 > 8
+
+#elif K0 == 8 // K0 == 8
+ // This part computes the following transpositions:
+ // 8x2 -> 2x8
+ // 8x4 -> 4x8
+ // 8x8 -> 8x8
+ // 8x16 -> 16x8
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
+#endif // N0 > 8
+
+#elif K0 == 16 // K0 == 16
+
+ // This part computes the following transpositions:
+ // 16x2 -> 2x16
+ // 16x4 -> 4x16
+ // 16x8 -> 8x16
+ // 16x16 -> 16x16
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
+ a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
+ a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
+ a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
+ a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
+ a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
+ a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
+ a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
+ a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
+ a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
+ a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
+ a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
+ a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
+ a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
+ a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
+ a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
+ a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
+#endif // N0 > 8
+
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+
+ // ---------------------------Store the output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(TRANSPOSE)
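+
+// Illustrative example (values assumed for clarity): with -DK0=2 -DN0=2 -DH0=1 and no
+// -DINTERLEAVE, gemm_reshape_rhs_matrix_t reads the 2x2 block
+//   | a0.s0  a0.s1 |
+//   | a1.s0  a1.s1 |
+// and stores it on one output row as a0.s0, a1.s0, a0.s1, a1.s1, i.e. the block is written
+// column by column (transposed), matching the K0 == 2 branch above.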
+#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && \
+ defined(M) && defined(N) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) ({ c = fma(a, b, c); })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c = fma((a.s2), (b.s2), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c = fma((a.s3), (b.s3), c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
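+
+// Illustrative expansion: ARM_DOT4(a, b, c) accumulates a 4-element dot product through a chain of
+// fused multiply-adds:
+//   c = fma(a.s0, b.s0, c); c = fma(a.s1, b.s1, c); c = fma(a.s2, b.s2, c); c = fma(a.s3, b.s3, c);
+// ARM_DOT8 and ARM_DOT16 recurse on the .lo/.hi halves of the vectors in the same way.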
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##8), (c.s8)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##9), (c.s9)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##A), (c.sA)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##B), (c.sB)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##C), (c.sC)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##D), (c.sD)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##E), (c.sE)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
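+
+// Illustrative expansion (K0 = 2, N0 = 2 assumed): ARM_DOT_K0XN0(2, a0, b, c0) becomes
+//   c0.s0 = fma(a0.s0, b0.s0, c0.s0); c0.s0 = fma(a0.s1, b0.s1, c0.s0);
+//   c0.s1 = fma(a0.s0, b1.s0, c0.s1); c0.s1 = fma(a0.s1, b1.s1, c0.s1);
+// i.e. one dot product of the K0-wide LHS row against each of the N0 transposed RHS columns.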
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should
+ * also be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+  // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
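+
+// Illustrative build configuration (hypothetical values taken from the documentation examples
+// above): this kernel could be compiled with options such as
+//   -DDATA_TYPE=float -DM=52 -DN=30 -DK=90 -DM0=2 -DN0=8 -DK0=4 -DH0=2
+// provided the RHS matrix was reshaped beforehand with the same N0, K0 and H0 and with the K0xN0
+// blocks transposed.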
+
+#define VFMA(a, b, c) ({ c = fma(a, b, c); })
+
+#if M0 == 1
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
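+
+// Illustrative expansion (M0 = 2 assumed): LD_RHS_VFMA_M0xN0(0, a, c) loads one N0-wide row of the
+// reshaped RHS,
+//   b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset
+//                                           + 0x0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+// and multiply-accumulates it into every output row, scaled by element 0 of the corresponding
+// LHS row:
+//   c0 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a0.s0), b, c0);
+//   c1 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a1.s0), b, c1);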
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should
+ * also be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); // uint zin0=0,zin1=0,zin2=0,... zin7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0,zero1=0,zero2=0,... zero7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+  // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+                           0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+ LD_RHS_VFMA_M0xN0(1, a, c);
+#if K0 > 2
+ LD_RHS_VFMA_M0xN0(2, a, c);
+#endif // K0 > 2
+#if K0 > 3
+ LD_RHS_VFMA_M0xN0(3, a, c);
+#endif // K0 > 3
+#if K0 > 4
+ LD_RHS_VFMA_M0xN0(4, a, c);
+ LD_RHS_VFMA_M0xN0(5, a, c);
+ LD_RHS_VFMA_M0xN0(6, a, c);
+ LD_RHS_VFMA_M0xN0(7, a, c);
+#endif // K0 > 4
+#if K0 > 8
+ LD_RHS_VFMA_M0xN0(8, a, c);
+ LD_RHS_VFMA_M0xN0(9, a, c);
+ LD_RHS_VFMA_M0xN0(A, a, c);
+ LD_RHS_VFMA_M0xN0(B, a, c);
+ LD_RHS_VFMA_M0xN0(C, a, c);
+ LD_RHS_VFMA_M0xN0(D, a, c);
+ LD_RHS_VFMA_M0xN0(E, a, c);
+ LD_RHS_VFMA_M0xN0(F, a, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+  // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
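+
+// Note (illustrative): unlike gemm_mm_reshaped_only_rhs_t above, this kernel expects the K0xN0
+// blocks of the reshaped RHS to be stored without transposition, so RHS_OFFSET_X and RHS_STEP_X
+// are expressed in terms of N0 (one N0-wide row per step) rather than K0.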
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) &&
+ // defined(M) && defined(N) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && \
+ defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+
+#if defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ c += a.s8 * b.s8; \
+ c += a.s9 * b.s9; \
+ c += a.sA * b.sA; \
+ c += a.sB * b.sB; \
+ c += a.sC * b.sC; \
+ c += a.sD * b.sD; \
+ c += a.sE * b.sE; \
+ c += a.sF * b.sF; \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#else // defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ c = fma(a.s8, b.s8, c); \
+ c = fma(a.s9, b.s9, c); \
+ c = fma(a.sA, b.sA, c); \
+ c = fma(a.sB, b.sB, c); \
+ c = fma(a.sC, b.sC, c); \
+ c = fma(a.sD, b.sD, c); \
+ c = fma(a.sE, b.sE, c); \
+ c = fma(a.sF, b.sF, c); \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#endif // defined(MIXED_PRECISION)
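+
+// Illustrative expansion (K0 = 2 assumed): ARM_DOT_K0(a, b, c) becomes
+//   c += a.s0 * b.s0; c += a.s1 * b.s1;                // with -DMIXED_PRECISION
+//   c = fma(a.s0, b.s0, c); c = fma(a.s1, b.s1, c);    // without -DMIXED_PRECISION
+// In the mixed-precision case c is declared with DATA_TYPE_ACCUMULATOR (e.g. float for F16 data),
+// so the partial sums are accumulated at higher precision than the inputs.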
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ ARM_DOT_K0((a), (b##8), (c.s8)); \
+ ARM_DOT_K0((a), (b##9), (c.s9)); \
+ ARM_DOT_K0((a), (b##A), (c.sA)); \
+ ARM_DOT_K0((a), (b##B), (c.sB)); \
+ ARM_DOT_K0((a), (b##C), (c.sC)); \
+ ARM_DOT_K0((a), (b##D), (c.sD)); \
+ ARM_DOT_K0((a), (b##E), (c.sE)); \
+ ARM_DOT_K0((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
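+
+// Note (illustrative): this ARM_DOT_K0XN0 variant fixes K0 at compile time inside ARM_DOT_K0, so
+// no k0 argument is threaded through as in the earlier CONCAT-based version; each call still
+// performs one ARM_DOT_K0 per RHS column b0..b(N0-1), accumulating into the matching component
+// of c.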
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 block must
+ * NOT be transposed. The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and
+ * the K0xN0 block must be transposed.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using
+ * -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION
+ * passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should
+ * also be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (get_global_id(1) / V0) * (uint)lhs_stride_y +
+ (get_global_id(2) * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+ for (int i = 0; i < k; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D,
+ dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(LHS_TRANSPOSE)
+
+#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
+
+#if defined(MIXED_PRECISION)
+
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) \
+ c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * \
+ (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) \
+ c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), \
+ (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#else // defined(MIXED_PRECISION)
+
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#endif // defined(MIXED_PRECISION)
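+
+// For example (illustrative only): with N0 = 4, ARM_VFMA(4, a, b, c) reduces to
+// c = fma(a, b, c) on non-Midgard targets and to c += a * b on Midgard; in a MIXED_PRECISION
+// build both operands are first converted to VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, 4) before
+// the accumulation.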
+
+#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) ({ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); })
+#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
+ })
+#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
+ })
+#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
+ })
+#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
+ })
+
+// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication
+// with K0 = 1:
+//   - a is the column-vector (transposed)
+//   - b is the row-vector (not transposed)
+//   - C is the output matrix
+// Lower case names are vectors (a, b); upper case names are matrices (C).
+#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
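+
+// For example (illustrative only), ARM_VVM_T_NT_M0xN0x1(3, 4, float, a, b, C) expands to
+// ARM_VVM_T_NT_3xN0x1(4, float, a, b, C), i.e. one ARM_VFMA per LHS element:
+//   ARM_VFMA(4, (float4)(a.s0), b, C0);
+//   ARM_VFMA(4, (float4)(a.s1), b, C1);
+//   ARM_VFMA(4, (float4)(a.s2), b, C2);
+// Each LHS element is broadcast across the row-vector b and accumulated into its output row.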
+
+#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
+ ({ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); })
+#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
+ })
+
+// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
+// The dimensions for this matrix multiplication are defined through M0, N0 and K0
+// The dimensions supported are:
+// M0: 1, 2, 3, 4, 8
+// N0: 1, 2, 3, 4, 8, 16
+// K0: 1, 2, 3, 4, 8, 16
+// This macro calls the vector-by-matrix macro K0 times
+// A, B and C are matrices
+#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
+ CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
+ (M0, N0, TYPE, A, B, C)
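+
+// For example (illustrative only), ARM_MM_T_NT(4, 8, 2, float, A, B, C) expands to
+// ARM_MM_T_NT_M0xN0x2(4, 8, float, A, B, C), which performs two rank-1 updates of the 4x8
+// accumulator C: one with the column-vector A0 and the row-vector B0, and one with A1 and B1.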
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks
+ * must be transposed. The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and
+ * the K0xN0 blocks must NOT be transposed.
+ *
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g.
+ * -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also
+ * be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in]  lhs_step_x                              lhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in]  lhs_step_y                              lhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in]  rhs_step_x                              rhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in]  rhs_step_y                              rhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                                 Pointer to the destination matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
+
+ const uint x = get_global_id(0);
+ const uint y = get_global_id(1);
+ const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+ __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+ __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
+ for (int i = 0; i < k; i += K0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, M0)
+ a0 = VLOAD(M0)(0, lhs);
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+#if K0 > 1
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+ lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+ rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr =
+ bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) +
+ z * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#endif // defined(LHS_TRANSPOSE)
+
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) &&
+ // defined(DATA_TYPE)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#define VFMA(a, b, c) ({ c = fma(a, b, c); })
+
+#if M0 == 1
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); })
+#elif M0 == 2 // M0 == 2
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is NOT reshaped
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g.,
+ * -DK0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also
+ * be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
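+ * @note For illustration only, a plausible (non-prescriptive) set of build options for this
+ * kernel could be: -DM=52 -DN=30 -DK=90 -DM0=4 -DN0=4 -DK0=4 -DDATA_TYPE=float; the values are
+ * arbitrary examples taken from the ranges listed above.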
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x lhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y lhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type:
+ * same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x rhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y rhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                                Pointer to the destination matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+  // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
+
+ RHS_VFMA_M0xN0(0, a, b0, c);
+ RHS_VFMA_M0xN0(1, a, b1, c);
+#if K0 > 2
+ RHS_VFMA_M0xN0(2, a, b2, c);
+#endif // K0 > 2
+#if K0 > 3
+ RHS_VFMA_M0xN0(3, a, b3, c);
+#endif // K0 > 3
+#if K0 > 4
+ RHS_VFMA_M0xN0(4, a, b4, c);
+ RHS_VFMA_M0xN0(5, a, b5, c);
+ RHS_VFMA_M0xN0(6, a, b6, c);
+ RHS_VFMA_M0xN0(7, a, b7, c);
+#endif // K0 > 4
+#if K0 > 8
+ RHS_VFMA_M0xN0(8, a, b8, c);
+ RHS_VFMA_M0xN0(9, a, b9, c);
+ RHS_VFMA_M0xN0(A, a, bA, c);
+ RHS_VFMA_M0xN0(B, a, bB, c);
+ RHS_VFMA_M0xN0(C, a, bC, c);
+ RHS_VFMA_M0xN0(D, a, bD, c);
+ RHS_VFMA_M0xN0(E, a, bE, c);
+ RHS_VFMA_M0xN0(F, a, bF, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * rhs_stride_y;
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
+ RHS_VFMA_M0xN0(0, a, b, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += rhs_stride_y;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+  // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between
+ * matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also
+ * be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
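+ * @note For illustration only, a plausible (non-prescriptive) set of build options for this
+ * kernel could be: -DCOLS_B=128 -DMULT_TRANSPOSE1XW_WIDTH=2 -DMULT_INTERLEAVE4X4_HEIGHT=2
+ * -DALPHA=0.5f; the numeric values are arbitrary examples.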
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src0_step_x                             src0_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src0_step_y                             src0_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src1_step_x                             src1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src1_step_y                             src1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ *                                                     data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                                 Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float4 c0 = 0.0f;
+ float4 c1 = 0.0f;
+ float4 c2 = 0.0f;
+ float4 c3 = 0.0f;
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
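+
+  // Illustrative example (values assumed): with HEIGHT_GEMM3D = 8 and DEPTH_GEMM3D > 1, a
+  // work-item with get_global_id(1) = 1 covers rows 4..7, which all lie in plane 0, so
+  // zout = (0, 0, 0, 0) and no extra offset is added; with get_global_id(1) = 2 the rows 8..11
+  // lie in plane 1, so each row's store address is shifted by cross_plane_pad * dst_stride_y bytes.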
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x4 block
+ vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between
+ * matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also
+ * be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src0_step_x                             src0_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src0_step_y                             src0_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src1_step_x                             src1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src1_step_y                             src1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ *                                                     data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                                 Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float4 c0 = 0.0f;
+ float4 c1 = 0.0f;
+ float4 c2 = 0.0f;
+ float4 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
+
+ int i = 0;
+ for (; i <= (int)(COLS_MTX_B - 4); i += 4)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+ }
+
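+  // Leftover loop: processes the remaining COLS_MTX_B % 4 column blocks of the transposed
+  // matrix B one float4 at a time, so COLS_MTX_B does not have to be a multiple of four.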
+ for (; i < (int)(COLS_MTX_B); ++i)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
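+  // Illustration (hypothetical values): with HEIGHT_GEMM3D = 6 and get_global_id(1) = 1 this
+  // tile covers output rows 4..7, so zout holds planes (0, 0, 1, 1); after the multiply above
+  // the last two rows of the tile are shifted down by cross_plane_pad * dst_stride_y bytes.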
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
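+
+  // Rough sketch of what the bias helper macros above expand to on the BROADCAST_BIAS path (an
+  // assumption based on the usual gemm_helpers.h definitions, shown only for readability):
+  //   LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero)
+  //     -> float4 bias0 = vload4(0, (__global float *)(src2_addr + 0 * src2_stride_y + zero0));
+  //   SCALE_BLOCK(1, float, bias, BETA)  -> bias0 *= (float4)BETA;
+  //   ADD_BLOCK_BROADCAST(4, c, bias0)   -> c0 += bias0; c1 += bias0; c2 += bias0; c3 += bias0;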
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x4 block
+ vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
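+
+// Host-side note (an assumption, not part of this patch): the gemm_mm_interleaved_transposed_*
+// kernels are typically built with options such as
+//   "-DCOLS_B=<n> -DMULT_TRANSPOSE1XW_WIDTH=<w> -DMULT_INTERLEAVE4X4_HEIGHT=<h>"
+// plus optional -DALPHA=<alpha>, -DBETA=<beta>/-DBROADCAST_BIAS and -DACTIVATION_TYPE=<act>,
+// and each work-item produces one 4x4 (F32) or 4x8 (F16) tile of the destination matrix.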
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and
+ * matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ half8 c0 = 0.0f;
+ half8 c1 = 0.0f;
+ half8 c2 = 0.0f;
+ half8 c3 = 0.0f;
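+
+  // Note: this variant accumulates directly in half precision; the
+  // gemm_mm_interleaved_transposed_f16_acc32 kernel below trades extra convert instructions for
+  // float32 accumulation when more numerical headroom is needed.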
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and
+ * matrix B reshaped (src1) while accumulating the result in 32-bit floating point variables.
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float8 c0 = 0.0f;
+ float8 c1 = 0.0f;
+ float8 c2 = 0.0f;
+ float8 c3 = 0.0f;
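+
+  // The operands are half precision but the accumulators are float8: each half4/half8 load is
+  // widened with convert_float4/convert_float8 below, so products are accumulated in float32.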
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+ b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+ float8 bias_f1 = convert_float8(bias1);
+ float8 bias_f2 = convert_float8(bias2);
+ float8 bias_f3 = convert_float8(bias3);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ half8 c_h0 = convert_half8(c0);
+ half8 c_h1 = convert_half8(c1);
+ half8 c_h2 = convert_half8(c2);
+ half8 c_h3 = convert_half8(c3);
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication
+ * between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ half8 c0 = 0.0f;
+ half8 c1 = 0.0f;
+ half8 c2 = 0.0f;
+ half8 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
+
+ int i = 0;
+ for (; i <= (int)(COLS_MTX_B - 4); i += 4)
+ {
+#if MULT_INTERLEAVE4X4_HEIGHT == 1
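+    // Specialization: with MULT_INTERLEAVE4X4_HEIGHT == 1 a single half8 load of A covers two
+    // consecutive accumulation steps (a0.s0123 for the first, a0.s4567 for the second), each
+    // paired with its own half8 load of B.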
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half8 a0 = vload8(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s4, b0, c0);
+ c1 = fma((half8)a0.s5, b0, c1);
+ c2 = fma((half8)a0.s6, b0, c2);
+ c3 = fma((half8)a0.s7, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload8(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s4, b0, c0);
+ c1 = fma((half8)a0.s5, b0, c1);
+ c2 = fma((half8)a0.s6, b0, c2);
+ c3 = fma((half8)a0.s7, b0, c3);
+#else // MULT_INTERLEAVE4X4_HEIGHT == 1
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
+ }
+
+ for (; i < (int)(COLS_MTX_B); ++i)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+
+#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && \
+  defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#if defined(DATA_TYPE)
+#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
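+// For example (hypothetical build options): with -DDATA_TYPE=float and
+// -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4, VECTOR_TYPE expands to float4.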
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped.
+ *
+ * @note This OpenCL kernel works with floating point data types (F16/F32)
+ * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g.
+ * -DDATA_TYPE=float)
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The number of matrix A columns and the optional alpha's value need to be passed at compile
+ * time using -DCOLS_A and -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16/F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(DATA_TYPE);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
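+  // zin now holds a per-row byte offset that is added to every load of matrix A below (via the
+  // LOAD_BLOCK Z argument and the zin.sN terms) to step over the cross-plane padding of the 3D
+  // input.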
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
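+  // end_row_vec_a marks one byte past the last element of the current row of matrix A; the main
+  // loop below consumes two columns of A (two rows of B) per iteration and the loop after it
+  // handles a possible leftover column.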
+
+ VECTOR_TYPE acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VECTOR_TYPE acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VECTOR_TYPE acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VECTOR_TYPE acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE));
+ src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0,
+ src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ VECTOR_TYPE b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+ VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(
+ 0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0.s0;
+ acc0 += b1 * (VECTOR_TYPE)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1.s0;
+ acc1 += b1 * (VECTOR_TYPE)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2.s0;
+ acc2 += b1 * (VECTOR_TYPE)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3.s0;
+ acc3 += b1 * (VECTOR_TYPE)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ VECTOR_TYPE b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y,
+ zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias,
+ src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc,
+ dst_addr, dst_stride_y, zout.s);
+}
+#endif // defined(DATA_TYPE)
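+
+// Launch sketch for gemm_mm_floating_point (an assumption, not part of this patch): each
+// work-item writes a NUM_ELEMS_PROCESSED_PER_THREAD_X x NUM_ELEMS_PROCESSED_PER_THREAD_Y tile of
+// dst, so the global work size is roughly (ceil(N / X), ceil(M / Y), batches), where X and Y are
+// those two compile-time constants, N the columns and M the rows of the destination matrix.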
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma
+ * units.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]).
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for matrix B
+ src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
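+  // Example (illustrative): with MATRIX_B_DEPTH = 16 and batch index get_global_id(2) = 35, the
+  // kernel reuses matrix B slice 35 % 16 = 3 rather than reading past the 3-dimensional matrix B.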
+
+ // Initialize accumulators
+ float4 acc0 = 0.0f;
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float4 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float4 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float4 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(float);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0, b0.s3, acc0.s3);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1.s0 = fma(a1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1, b0.s3, acc1.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2.s0 = fma(a2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2, b0.s3, acc2.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3.s0 = fma(a3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
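+
+// Illustrative (hypothetical) build options for gemm_mm_floating_point_f32_bifrost, assuming a
+// 4x4 output block per work-item, 256 columns in matrix A and a fused ReLU activation; the
+// values are examples only, not the host-side defaults:
+//   -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4 -DCOLS_A=256
+//   -DACTIVATION_TYPE=RELU -DA_VAL=0.0f -DB_VAL=0.0f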
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma
+ * units. This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less
+ * than or equal to 1000.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if
+ * alpha!=1.0f.
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]).
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+  // Requires NUM_ELEMS_PROCESSED_PER_THREAD_X = 2, C as vect2, A as vect4, B as 2 vload2.
+  // To fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize accumulators
+ float2 acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float2 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float2 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float2 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
+ for (; i <= ((int)COLS_A - 8); i += 8)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+ acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
+ acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
+ acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
+ acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
+ acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
+ acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
+ acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
+
+ acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+ acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
+ acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
+ acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
+ acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
+ acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
+ acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
+ acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
+ acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
+ acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
+ acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
+ acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
+ acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
+ acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
+ acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
+
+ acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
+ acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
+ acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
+ acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
+ acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
+ acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
+ acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
+ acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
+ acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
+ acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
+ acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
+ acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
+ acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
+ acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
+ acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
+
+ acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
+ acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
+ acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
+ acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
+ acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
+ acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
+ acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
+ acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
+ acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
+ acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
+ acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
+ acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
+ acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
+ acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
+ acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
+
+ acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
+ acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
+ acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
+ acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
+ acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
+ acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
+ acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
+ acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float) * 8;
+ }
+  // Leftover loop: process the remaining columns of matrix A one float at a time
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0, b0.s1, acc0.s1);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1.s0 = fma(a1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1, b0.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2.s0 = fma(a2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2, b0.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3.s0 = fma(a3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3, b0.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
+
+ LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
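+
+// Illustrative (hypothetical) build options for gemm_mm_floating_point_f32_bifrost_1000, which
+// computes two output columns per work-item (vstore2); the values are examples only:
+//   -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4 -DCOLS_A=512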
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates
+ * the result in a 32-bit floating point variable.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]).
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  float8 acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+  float8 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+  float8 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+  float8 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (float8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+  acc0 = fma(b0, (float8)a0, acc0); // b0 * (float8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+  acc1 = fma(b0, (float8)a1, acc1); // b0 * (float8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+  acc2 = fma(b0, (float8)a2, acc2); // b0 * (float8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+  acc3 = fma(b0, (float8)a3, acc3); // b0 * (float8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float8 bias_f1 = convert_float8(bias1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float8 bias_f2 = convert_float8(bias2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float8 bias_f3 = convert_float8(bias3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ half8 acc_h0 = convert_half8(acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half8 acc_h1 = convert_half8(acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half8 acc_h2 = convert_half8(acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half8 acc_h3 = convert_half8(acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
+}
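+
+// Illustrative (hypothetical) build options for gemm_mm_floating_point_f16_bifrost_acc32,
+// assuming half-precision inputs, beta*bias addition with a broadcast bias vector, and the block
+// sizes suggested in the note above; the values are examples only:
+//   -DARM_COMPUTE_OPENCL_FP16_ENABLED -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4
+//   -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4 -DCOLS_A=128 -DBETA=0.5f -DBROADCAST_BIAS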
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma
+ * units.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]).
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ half8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
+}
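+
+/* A minimal sketch of the build options gemm_mm_floating_point_f16_bifrost expects at compile
+ * time. The values below are illustrative placeholders only; the actual option string is
+ * assembled at runtime by the host-side kernel configuration from the tensor shapes and GEMM
+ * info:
+ *
+ *   -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4
+ *   -DCOLS_A=64 -DALPHA=1.0f -DBETA=1.0f -DACTIVATION_TYPE=RELU -DA_VAL=0.0f -DB_VAL=0.0f
+ */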
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) &&
+       // defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+
+#if defined(BETA)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account
+ * that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The value of beta needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types:
+ * F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+ // Load values from Matrix C
+ float4 c = vload4(0, (__global float *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ float4 out = alpha_ab + (float4)BETA * c;
+
+ // Store final result in axb matrix
+ vstore4(out, 0, (__global float *)dst.ptr);
+}
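+
+/* Illustrative behaviour sketch (assuming the kernel is built with -DBETA=0.5f): each work item
+ * updates one float4 of the A x B result in place, i.e.
+ *
+ *   dst = dst + 0.5f * src
+ *
+ * where src holds the corresponding four elements of matrix C. */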
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account
+ * that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The value of beta needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types:
+ * F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+ // Load values from Matrix C
+ half8 c = vload8(0, (__global half *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ half8 out = alpha_ab + (half8)BETA * c;
+
+ // Store final result in axb matrix
+ vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(BETA)
+
+#if defined(WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and
+ * matrix B (src1) used for locally connected layer
+ *
+ * @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @note The input A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0), TENSOR3D_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+ int idy = get_global_id(1);
+
+ // Compute the address for the vector A and matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy,
+ src1_offset_first_element_in_bytes + src1_stride_z * idy));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float));
+ src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
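+
+/* Illustrative usage sketch (the value 27 below is a placeholder): building the kernel with
+ * -DWIDTH_VECTOR_A=27 makes each work item compute one float4 of output row idy, where lane j
+ * accumulates sum over k = 0..26 of A[idy][k] * B[idy][k][idx + j]. */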
+#endif // defined(WIDTH_VECTOR_A)
+
+/** This kernel accumulates each row with the biases vector.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported
+ * data type: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]      accum_stride_x                     Stride of the accumulate tensor in X
+ * dimension (in bytes)
+ * @param[in] accum_step_x accum_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                     Stride of the accumulate tensor in Y
+ * dimension (in bytes)
+ * @param[in]      accum_step_y                       accum_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the
+ * accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p
+ * accum_ptr
+ * @param[in]      biases_stride_x                    Stride of the biases vector in X
+ *                                                 dimension (in bytes)
+ * @param[in]      biases_step_x                      biases_stride_x * number of elements along X
+ *                                                 processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the
+ *                                                 biases vector
+ */
+#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
+__kernel void gemm_accumulate_biases(IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases))
+{
+ Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ // Vector size, e.g. number of vector elements.
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
+ accum_value = biases_value + accum_value;
+ // Store result in the accumulate buffer
+ VSTORE(VECTOR_SIZE)
+ (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
+}
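+
+/* Illustrative expansion, assuming the kernel is built with -DDATA_TYPE=float and
+ * -DVECTOR_SIZE=16: the vector load/add/store above becomes
+ *
+ *   float16 accum_value  = vload16(0, (__global float *)accum.ptr);
+ *   float16 biases_value = vload16(0, (__global float *)biases.ptr);
+ *   accum_value = biases_value + accum_value;
+ *   vstore16(accum_value, 0, (__global float *)accum.ptr);
+ */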
+#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h
new file mode 100644
index 000000000..0c75d061f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h
@@ -0,0 +1,1235 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ROW_n
+ *
+ * @param[in] N0        The number of columns to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+
+/** @}*/ // end of group LOAD_ROW_n
+
+/** Load Blocks (consecutive rows and columns) with Z offset.
+ * @name LOAD_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_BLOCK
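+
+/* Illustrative expansion (a, src_ptr, off, stride_y and zin are placeholder names):
+ * LOAD_BLOCK(2, 4, float, a, src_ptr, off, stride_y, zin) becomes
+ *
+ *   float4 a0 = vload4(0, (__global float *)(src_ptr + off + 0 * stride_y + zin0));
+ *   float4 a1 = vload4(0, (__global float *)(src_ptr + off + 1 * stride_y + zin1));
+ *
+ * with zin0 and zin1 typically produced by CALCULATE_Z_OFFSET further below. */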
+
+/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ELEMENT_n
+ *
+ * @param[in] N0        The size of the destination vectors
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @{
+ */
+#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
+
+#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
+
+#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
+
+#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
+
+#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
+
+#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
+
+#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
+
+#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
+
+#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
+
+#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
+
+#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
+
+#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
+
+#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
+
+#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
+
+#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
+
+#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
+
+/** @}*/ // end of group LOAD_ELEMENT_n
+
+/** Load Scalar as Vector (consecutive elements).
+ * @name LOAD_SCALAR_AS_VECTOR
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @{
+ */
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
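+
+/* Illustrative expansion (b, src_ptr, off and stride_y are placeholder names):
+ * LOAD_SCALAR_AS_VECTOR(2, 4, float, b, src_ptr, off, stride_y) becomes
+ *
+ *   float4 b0 = *((__global float *)(src_ptr + off + 0 * stride_y));
+ *   float4 b1 = *((__global float *)(src_ptr + off + 1 * stride_y));
+ *
+ * i.e. each loaded scalar is broadcast to every lane of its destination vector. */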
+
+/** Basic macros to calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET_n
+ *
+ * @param[in] M0 The number of offset values to calculate
+ * @param[in] DATA_TYPE The data type of the results
+ * @param[in] Z The basename of the result variables
+ * @param[in] Y               The work-item ID of the y-axis
+ * @param[in] HEIGHT_GEMM3D The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ *
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \
+ Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \
+ Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \
+ Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \
+ Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \
+ Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \
+ Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \
+ Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \
+ Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+/** @} */ // end of group CALCULATE_Z_OFFSET_n
+
+/** Calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET
+ *
+ * The Z offsets are expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected names of the Z offsets are zin0, zin1 and zin2.
+ * Note that CROSS_PLANE_PAD (cross plane padding) is required to take into account
+ * the possible cross plane paddings in case the plane changes across the z-dimension.
+ *
+ * <!--
+ * | |
+ * | plane0 |
+ * | |
+ * |__________________|
+ * |******************|
+ * | cross_plane_pad |
+ * |******************|
+ * | |
+ * | plane1 |
+ * | |
+ * |__________________|
+ * -->
+ *
+ * @param[in] M0 The number of offset values to calculate
+ * @param[in] DATA_TYPE The data type of the results
+ * @param[in] Z The basename of the result variables
+ * @param[in] Y               The work-item ID of the y-axis
+ * @param[in] HEIGHT_GEMM3D The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y)
+/** @} */ // end of group CALCULATE_Z_OFFSET
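+
+/* Illustrative expansion (zin, y, pad and stride_y are placeholder names; zin0/zin1 must already
+ * be declared by the caller): CALCULATE_Z_OFFSET(2, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D,
+ * pad, stride_y) becomes
+ *
+ *   zin0 = (0 + (uint)(y * (uint)2)) / (uint)HEIGHT_GEMM3D;
+ *   zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ *   zin0 *= (pad * stride_y);
+ *   zin1 = (1 + (uint)(y * (uint)2)) / (uint)HEIGHT_GEMM3D;
+ *   zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+ *   zin1 *= (pad * stride_y);
+ */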
+
+/** Store the 0 to (n-1)th rows of the given variables
+ * @name STORE_ROW_n
+ *
+ * @param[in] N0 The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group STORE_ROW_n
+
+/** Convert and store the 0th to (n-1)th rows of the given variables
+ * @name CONVERT_STORE_ROW_n
+ *
+ * @param[in] N0 The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+/** @} */ // end of group CONVERT_STORE_ROW_n
+
+/** Store a block of the given size M0xN0
+ * @name STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group STORE_BLOCK
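+
+/* Illustrative expansion (c, dst_addr, dst_stride_y and zout are placeholder names):
+ * STORE_BLOCK(2, 8, half, c, dst_addr, dst_stride_y, zout) becomes
+ *
+ *   vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout0));
+ *   vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout1));
+ */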
+
+/** Convert and store a block of the given size M0xN0
+ * @name CONVERT_STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group CONVERT_STORE_BLOCK
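+
+/* Illustrative expansion (c, dst_addr, dst_stride_y and zout are placeholder names):
+ * CONVERT_STORE_BLOCK(1, 16, uchar, c, dst_addr, dst_stride_y, zout) becomes
+ *
+ *   vstore16(convert_uchar16_sat((c0)), 0,
+ *            (__global uchar *)(dst_addr + 0 * dst_stride_y + zout0));
+ */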
+
+/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
+ * @name SCALE_ROW_n
+ *
+ * @param[in] DATA_TYPE The data type of the variables
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] SCALE The scale factor
+ * @{
+ */
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##1 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##2 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##3 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##4 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##5 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##6 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##7 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##8 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##9 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##A *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##B *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##C *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##D *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##E *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##F *= (DATA_TYPE)SCALE;
+/** @} */ // end of group SCALE_ROW_n
+
+/** Scale elements stored in a block (BASENAME)
+ * @name SCALE_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N The number of rows in the block
+ * @param[in] DATA_TYPE The data type of the block
+ * @param[in] BASENAME The basename of the block
+ * @param[in] SCALE The scale factor
+ * @{
+ */
+#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+/** @} */ // end of group SCALE_BLOCK
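+
+// Example (illustrative): SCALE_BLOCK(3, float, c, ALPHA) expands to
+//   c0 *= (float)ALPHA; c1 *= (float)ALPHA; c2 *= (float)ALPHA;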
+
+/** Create a new vector containing the values at the given index for a set of given vectors
+ * @name COLUMN_VECTORn
+ *
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
+ TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 2) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
+#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 3) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
+#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 4) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \
+ (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \
+ (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \
+ (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \
+ (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \
+ (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, \
+ (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, \
+ (X##F).s##IDX_COL);
+/** @} */ // end of group COLUMN_VECTORn
+
+/** Create a new vector containing the values at the given index. Utility macros for transposing a
+ * column-vector
+ * @name COLUMN_VECTOR_SCALARn
+ *
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 2) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
+#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 3) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
+#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 4) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
+#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+ (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+/** @} */ // end of group COLUMN_VECTOR_SCALARn
+
+/** Create transposed vectors of the given vectors
+ * @name TRANSPOSE_K0Xn
+ *
+ * @param[in] K0 The size of the source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B The basename of source vectors for transposition
+ * @param[in] TYPE The data type of the transposed vectors
+ * @{
+ */
+#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
+ COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X2(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X3(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X4(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X8(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, A, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, B, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, C, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, D, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, E, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, F, BASENAME, B, TYPE);
+
+/** @} */ // end of group TRANSPOSE_K0Xn
+
+/** Create column vectors to contain the values at the given index for a set of given vectors
+ *
+ * @param[in] K0 The number of source vectors
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ */
+#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
+ CONCAT(COLUMN_VECTOR, K0) \
+ (IDX_COL, BASENAME, B, TYPE);
+
+/** Create column vectors to contain the values at the given index. Utility macro for transposing a
+ * column-vector
+ *
+ * @param[in] K0 The number of source vectors
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ */
+#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \
+ CONCAT(COLUMN_VECTOR_SCALAR, K0) \
+ (IDX_COL, BASENAME, B, TYPE);
+
+/** Create transposed vectors from the given source vectors
+ *
+ * @param[in] K0 The size of source vectors
+ * @param[in] N0 The number of source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B The basename of source vectors for transposition
+ * @param[in] TYPE The data type of the transposed vectors
+ *
+ */
+#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
+ CONCAT(TRANSPOSE_K0X, N0) \
+ (K0, BASENAME, B, TYPE);
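+
+// Example (illustrative): given uchar2 source vectors b0, b1, b2 and b3,
+// TRANSPOSE_K0XN0(4, 2, a, b, uchar) declares the transposed column vectors
+//   uchar4 a0 = (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0);
+//   uchar4 a1 = (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1);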
+
+/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
+ * @name ADD_ROW_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The basename of the added variables
+ * @{
+ */
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
+
+#define ADD_ROW_2(BASENAME, BIAS) \
+ ADD_ROW_1(BASENAME, BIAS) \
+ BASENAME##1 += BIAS##1;
+
+#define ADD_ROW_3(BASENAME, BIAS) \
+ ADD_ROW_2(BASENAME, BIAS) \
+ BASENAME##2 += BIAS##2;
+
+#define ADD_ROW_4(BASENAME, BIAS) \
+ ADD_ROW_3(BASENAME, BIAS) \
+ BASENAME##3 += BIAS##3;
+
+#define ADD_ROW_5(BASENAME, BIAS) \
+ ADD_ROW_4(BASENAME, BIAS) \
+ BASENAME##4 += BIAS##4;
+
+#define ADD_ROW_6(BASENAME, BIAS) \
+ ADD_ROW_5(BASENAME, BIAS) \
+ BASENAME##5 += BIAS##5;
+
+#define ADD_ROW_7(BASENAME, BIAS) \
+ ADD_ROW_6(BASENAME, BIAS) \
+ BASENAME##6 += BIAS##6;
+
+#define ADD_ROW_8(BASENAME, BIAS) \
+ ADD_ROW_7(BASENAME, BIAS) \
+ BASENAME##7 += BIAS##7;
+
+#define ADD_ROW_9(BASENAME, BIAS) \
+ ADD_ROW_8(BASENAME, BIAS) \
+ BASENAME##8 += BIAS##8;
+
+#define ADD_ROW_10(BASENAME, BIAS) \
+ ADD_ROW_9(BASENAME, BIAS) \
+ BASENAME##9 += BIAS##9;
+
+#define ADD_ROW_11(BASENAME, BIAS) \
+ ADD_ROW_10(BASENAME, BIAS) \
+ BASENAME##A += BIAS##A;
+
+#define ADD_ROW_12(BASENAME, BIAS) \
+ ADD_ROW_11(BASENAME, BIAS) \
+ BASENAME##B += BIAS##B;
+
+#define ADD_ROW_13(BASENAME, BIAS) \
+ ADD_ROW_12(BASENAME, BIAS) \
+ BASENAME##C += BIAS##C;
+
+#define ADD_ROW_14(BASENAME, BIAS) \
+ ADD_ROW_13(BASENAME, BIAS) \
+ BASENAME##D += BIAS##D;
+
+#define ADD_ROW_15(BASENAME, BIAS) \
+ ADD_ROW_14(BASENAME, BIAS) \
+ BASENAME##E += BIAS##E;
+
+#define ADD_ROW_16(BASENAME, BIAS) \
+ ADD_ROW_15(BASENAME, BIAS) \
+ BASENAME##F += BIAS##F;
+
+/** @} */ // end of group ADD_ROW_n
+
+/** Add the block (BIAS) to another block (BASENAME)
+ * @name ADD_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The basename of the added variables
+ * @{
+ */
+#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK
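+
+// Example (illustrative): ADD_BLOCK(2, c, bias) expands to
+//   c0 += bias0; c1 += bias1;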
+
+/** Broadcast (add a single value) to each element of the destination variables
+ * @name ADD_ROW_BROADCAST_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The variable containing the value to add
+ * @{
+ */
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
+
+#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
+ BASENAME##1 += BIAS;
+
+#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+ BASENAME##2 += BIAS;
+
+#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+ BASENAME##3 += BIAS;
+
+#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+ BASENAME##4 += BIAS;
+
+#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+ BASENAME##5 += BIAS;
+
+#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+ BASENAME##6 += BIAS;
+
+#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+ BASENAME##7 += BIAS;
+
+#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+ BASENAME##8 += BIAS;
+
+#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+ BASENAME##9 += BIAS;
+
+#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+ BASENAME##A += BIAS;
+
+#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+ BASENAME##B += BIAS;
+
+#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+ BASENAME##C += BIAS;
+
+#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+ BASENAME##D += BIAS;
+
+#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+ BASENAME##E += BIAS;
+
+#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+ BASENAME##F += BIAS;
+/** @} */ // end of group ADD_ROW_BROADCAST_n
+
+/** Broadcast (add a value) to each element of the destination block (BASENAME)
+ * @name ADD_BLOCK_BROADCAST
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The variable containing the value to add
+ * @{
+ */
+#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK_BROADCAST
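+
+// Example (illustrative): ADD_BLOCK_BROADCAST(3, c, bias0) expands to
+//   c0 += bias0; c1 += bias0; c2 += bias0;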
+
+/** Apply activation to the given variables
+ * @name ACTIVATION_ROW_n
+ *
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] A_VAL Additional value required by the activation
+ * @param[in] B_VAL Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
+/** @} */ // end of group ACTIVATION_ROW_n
+
+/** Apply activation to a block (BASENAME)
+ * @name ACTIVATION_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] A_VAL Additional value required by the activation
+ * @param[in] B_VAL Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+/** @} */ // end of group ACTIVATION_BLOCK
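+
+// Example (illustrative): ACTIVATION_BLOCK(2, ACTIVATION_TYPE, int, c, A_VAL, B_VAL) expands to
+//   c0 = ACTIVATION(ACTIVATION_TYPE, int, c0, A_VAL, B_VAL);
+//   c1 = ACTIVATION(ACTIVATION_TYPE, int, c1, A_VAL, B_VAL);
+// where ACTIVATION is assumed to be provided by the activation helpers included by the kernel.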
+
+/** Apply convert_<data_type> to the given variables
+ * @name CONVERT_ROW_n
+ *
+ * @param[in] N The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
+ */
+#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
+/** @} */ // end of group CONVERT_ROW_n
+
+/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
+ * @name CONVERT_BLOCK
+ *
+ * Supported cases are M=1,2,3,...,16.
+ *
+ * @param[in] M The number of vectors to convert
+ * @param[in] N The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
+ */
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
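+
+// Example (illustrative): with int4 accumulators c0 and c1, CONVERT_BLOCK(2, 4, float, c, c_f)
+// declares
+//   float4 c_f0 = CONVERT(c0, float4);
+//   float4 c_f1 = CONVERT(c1, float4);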
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl
new file mode 100644
index 000000000..2d9acc753
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl
@@ -0,0 +1,2733 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers_asymm.h"
+#include "repeat.h"
+
+#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+ defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16].
+ * These macros use the dot8 instruction */
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
+ })
+#define ARM_DOT4(a, b, c) ({ ARM_DOT(a, b, c); })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16]
+ * without using the dot8 instruction. */
+#define ARM_DOT1(a, b, c) ({ c += (ACC_DATA_TYPE)a * b; })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c += (ACC_DATA_TYPE)a.s0 * b.s0; \
+ c += (ACC_DATA_TYPE)a.s1 * b.s1; \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s2 * b.s2; \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s3 * b.s3; \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
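+
+// Example (illustrative): without the dot8 extension, ARM_DOT4(a, b, c) expands to four scalar
+// multiply-accumulates, c += (ACC_DATA_TYPE)a.sI * b.sI for I = 0..3, whereas the dot8 path above
+// maps the same reduction onto a single arm_dot()/arm_dot_acc() call.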
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0
+ * vectors "b" of size K0 [1,16] */
+#define ARM_DOT_K0X1(k0, a, b, c) ({ ARM_DOT_K0(k0, (a), (b##0), (c)); })
+#define ARM_DOT_K0X2(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
+ ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
+ })
+#define ARM_DOT_K0X3(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X2(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
+ })
+#define ARM_DOT_K0X4(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X3(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
+ })
+#define ARM_DOT_K0X8(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X4(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
+ ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
+ ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
+ ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
+ })
+#define ARM_DOT_K0X16(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X8(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
+ ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
+ ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
+ ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
+ ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
+ ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
+ ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
+ ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
+ })
+
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_K0XN0X1(n0, k0, a, b, c) ({ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); })
+#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X1(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X2(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X3(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X4(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X5(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X6(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X7(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
+ })
+
+#define ARM_DOT_K0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b), (c)); \
+ })
+
+#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT_K0X, n0) \
+ (k0, (a), b, (c)); \
+ })
+
+#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_K0XN0X, m0) \
+ (n0, k0, a, b, c); \
+ })
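+
+// Example (illustrative): with LHS rows a0, a1 and RHS rows b0, b1 of size 4 and accumulators
+// c0, c1 of size 2, ARM_MM_K0XN0XM0(2, 2, 4, a, b, c) accumulates
+//   c0.s0 += dot(a0, b0); c0.s1 += dot(a0, b1);
+//   c1.s0 += dot(a1, b0); c1.s1 += dot(a1, b1);
+// where each dot() stands for the ARM_DOT_K0(4, ...) reduction above.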
+
+/** Specialized macros to perform a broadcast multiply-accumulate between one vector "a" of size
+ * K0 [1,16] and K0 vectors "b" of size N0 */
+#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) ({ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; })
+#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
+ c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
+ })
+#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
+ })
+#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
+ })
+#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
+ c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
+ c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
+ c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
+ })
+#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
+ c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
+ c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
+ c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
+ c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
+ c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
+ c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
+ c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
+ })
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); })
+#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
+ })
+#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MUL_N0X, k0) \
+ (VECTOR_ACC_TYPE, (a), b, (c)); \
+ })
+#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
+ (VECTOR_ACC_TYPE, k0, a, b, c); \
+ })
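+
+// Example (illustrative): ARM_MM_NATIVE_N0XK0XM0(uint4, 2, 2, a, b, c) accumulates, for the two
+// LHS rows a0 and a1,
+//   c0 += CONVERT(b0, uint4) * a0.s0; c0 += CONVERT(b1, uint4) * a0.s1;
+//   c1 += CONVERT(b0, uint4) * a1.s0; c1 += CONVERT(b1, uint4) * a1.s1;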
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && \
+ defined(N)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with
+ * QASYMM8/QASYMM8_SIGNED data type. The LHS matrix must be reshaped with @ref
+ * CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed. The RHS matrix must be
+ * reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y +
+ (z * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < k; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ // Update address
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
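+
+// Example (illustrative) build options for gemmlowp_mm_reshaped_lhs_nt_rhs_t, using the values
+// quoted in the notes above (uchar inputs, M=52, N=90, M0=4, N0=8, K0=4, V0=2, H0=2):
+//   -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint -DM=52 -DN=90 -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2
+// -DLHS_INTERLEAVE/-DRHS_INTERLEAVE, -DDUMMY_WORK_ITEMS and the 3D reinterpretation defines are
+// added only when the corresponding reshaping/dispatch configuration requires them.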
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped.
+ * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is
+ * transposed.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in
+ * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+                           0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < K; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ lhs_offset += K0;
+ rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
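+
+// Example (illustrative) build options for gemmlowp_mm_reshaped_only_rhs_t, using the values
+// quoted in the notes above (uchar inputs, K=64, M0=2, N0=8, K0=4, H0=2):
+//   -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint -DK=64 -DM0=2 -DN0=8 -DK0=4 -DH0=2
+// -DRHS_INTERLEAVE and the 3D reinterpretation defines are added only when the corresponding
+// configuration requires them.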
+
+#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage
+ * using fixed-point arithmetic. The LHS matrix is NOT reshaped. The RHS matrix is reshaped with
+ * @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @note The offset, scalar scale factor and number of bits to shift right of the output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed
+ * at compile time.
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix.
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in
+ * X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in
+ * Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in
+ * the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix.
+ * Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in
+ * X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in
+ * Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in
+ * the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix
+ * Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in
+ * Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in
+ * Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in
+ * Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS
+ * matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the
+ * output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: S32
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: S32
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: S32
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(
+ IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers), VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+  // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+                           0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < K; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ lhs_offset += K0;
+ rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ // Result of MM is of type DATA_TYPE
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(DATA_TYPE) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert result of matrix multiplication to S32
+ REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int);
+
+ int batch_id = z;
+#if defined(DEPTH_GEMM3D)
+ batch_id /= (int)DEPTH_GEMM3D;
+#endif // defined(DEPTH_GEMM3D)
+
+ // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET;
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET);
+
+#if defined(A_OFFSET)
+ // Compute the offset contribution due to A_OFFSET
+ __global uchar *sum_col_addr =
+ sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+#if defined(SUM_COL_HAS_BATCHES)
+ sum_col_addr += z * sum_col_stride_y;
+#endif // defined(SUM_COL_HAS_BATCHES)
+ VEC_DATA_TYPE(int, N0)
+ a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr);
+ a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET;
+
+ REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ // Compute the offset contribution due to B_OFFSET
+ __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes +
+ (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y;
+
+#if defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
+ sum_row_addr += (batch_id % (int)DEPTH_GEMM3D) * (int)HEIGHT_GEMM3D * sizeof(int);
+#endif // defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
+ LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x);
+
+ REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET);
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr =
+ biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+ VEC_DATA_TYPE(int, N0)
+ bias_values = VLOAD(N0)(0, (__global int *)bias_addr);
+ REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values);
+#endif // defined(ADD_BIAS)
+
+ REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_);
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr = result_multipliers_ptr +
+ result_multipliers_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+ VEC_DATA_TYPE(int, N0)
+ res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr);
+ VEC_DATA_TYPE(int, N0)
+ res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr);
+
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER,
+ RESULT_SHIFT);
+#else // RESULT_SHIFT >= 0
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER,
+ RESULT_SHIFT);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ // Add the offset terms to GEMM's result
+ REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET);
+
+#if defined(MIN_BOUND)
+ REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+  // Convert and store output block (saturating conversion)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c_int, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
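+
+/* Illustrative scalar reference (not part of the kernel, not compiled): a minimal sketch of what
+ * the fused output stage above computes for one accumulator, assuming per-tensor quantization and
+ * RESULT_SHIFT >= 0. The helper and function names below are hypothetical; the kernel performs the
+ * same steps through the REPEAT_* / ASYMM_* macros on whole M0 x N0 blocks.
+ *
+ *   #include <stdint.h>
+ *
+ *   // gemmlowp-style fixed-point multiply: round(a * b / 2^31) with a doubling nudge
+ *   static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
+ *   {
+ *     int64_t ab = (int64_t)a * (int64_t)b;
+ *     int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
+ *     return (int32_t)((ab + nudge) / (1ll << 31)); // INT32_MIN * INT32_MIN saturation ignored here
+ *   }
+ *
+ *   // Rounding right shift by a non-negative exponent
+ *   static int32_t rounding_shift_right(int32_t x, int32_t exponent)
+ *   {
+ *     if (exponent == 0)
+ *       return x;
+ *     int32_t mask = (1 << exponent) - 1;
+ *     int32_t remainder = x & mask;
+ *     int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
+ *     return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
+ *   }
+ *
+ *   static uint8_t fused_output_stage_reference(int32_t acc, int32_t sum_col, int32_t sum_row,
+ *                                               int32_t bias, int32_t a_offset, int32_t b_offset,
+ *                                               int32_t k_offset, int32_t multiplier,
+ *                                               int32_t shift, int32_t result_offset)
+ *   {
+ *     // Offset contribution: acc += A_OFFSET * sum_col + B_OFFSET * sum_row + K_OFFSET (+ bias)
+ *     acc += a_offset * sum_col + b_offset * sum_row + k_offset + bias;
+ *     // Requantize: fixed-point multiply, rounding shift, add RESULT_OFFSET, clamp to [0, 255]
+ *     acc = rounding_shift_right(sat_rounding_doubling_high_mul(acc, multiplier), shift);
+ *     acc += result_offset;
+ *     if (acc < 0)
+ *       acc = 0;
+ *     if (acc > 255)
+ *       acc = 255;
+ *     return (uint8_t)acc;
+ *   }
+ */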
+#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) &&
+ // defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is NOT reshaped
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e.,
+ * -DK0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data
+ * type: QASYMM8
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS
+ * matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension
+ * (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in
+ * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;
+
+#if defined(MATRIX_B_DEPTH)
+  // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+
+ for (; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += K0;
+ rhs_offset += K0 * rhs_stride_y;
+ }
+
+ // Left-over for loop
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,1
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += 1;
+ rhs_offset += rhs_stride_y;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+}
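+
+/* Illustrative host-side note (not compiled): a minimal sketch of the build options a caller might
+ * pass when compiling gemmlowp_mm_native, using the compile-time defines documented above. The
+ * concrete values and the build_opts variable are examples only, not the options generated by the
+ * runtime; -DM/-DN are additionally needed only when -DDUMMY_WORK_ITEMS is used.
+ *
+ *   const char *build_opts =
+ *     "-DDATA_TYPE=uchar -DACC_DATA_TYPE=uint "
+ *     "-DK=64 -DM0=4 -DN0=4 -DK0=16";
+ *   // e.g. clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
+ */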
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+#if defined(COLS_A)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix
+ * A. It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at
+ * compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y +
+ get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 16 accumulations
+ for (; i <= ((int)COLS_A - 16); i += 16)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
+
+ sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
+ }
+
+ // This for loop performs the leftover accumulations
+ for (; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+ sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
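+
+/* Illustrative scalar reference (not compiled): a minimal sketch of what gemmlowp_matrix_a_reduction
+ * computes for one row of matrix A, assuming QASYMM8 input; matrix_a_row, cols_a and scalar stand
+ * for one row of the source tensor, COLS_A and SCALAR. The vectorized loop above produces the same
+ * per-row sum.
+ *
+ *   #include <stdint.h>
+ *
+ *   static int32_t row_sum_reference(const uint8_t *matrix_a_row, int cols_a, int32_t scalar)
+ *   {
+ *     int32_t sum_row = 0;
+ *     for (int i = 0; i < cols_a; ++i)
+ *       sum_row += (int32_t)matrix_a_row[i]; // accumulate every entry of the row
+ *     return sum_row * scalar;               // scalar is 1 when -DSCALAR is not passed
+ *   }
+ */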
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A
+ * using the arm dot product instruction. It is also possible to multiply each reduced row by a
+ * scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y +
+ get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+  // This for loop performs 32 accumulations
+ for (; i <= ((int)COLS_A - 32); i += 32)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ a0 = vload16(0, matrix_a + i);
+
+ sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+
+ a0 = vload16(1, matrix_a + i);
+
+ sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ }
+
+ // This for loop performs the leftover accumulations
+ for (; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
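+
+/* Illustrative note (not compiled): arm_dot from the cl_arm_integer_dot_product_int8 extension
+ * reduces a 4-element vector against a vector of ones to the sum of its lanes, so each iteration of
+ * the loop above folds 32 entries of the row into sum_row:
+ *
+ *   // e.g. for a uchar4 v:  arm_dot(v, (uchar4)(1)) == v.s0 + v.s1 + v.s2 + v.s3
+ *   // two vload16 per iteration -> 8 dot products -> 32 accumulated entries
+ */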
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(COLS_A)
+
+#if defined(COLS_B) && defined(ROWS_B)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of
+ * Matrix B. It is also possible to multiply each reduced column by a scalar value, if SCALAR is
+ * passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using
+ * -DCOLS_B and -DROWS_B
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+ sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0;
+
+ __global const DATA_TYPE *matrix_b =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for (; i <= ((int)ROWS_B - 4); i += 4)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b + 0 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b1 = vload16(0, matrix_b + 1 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b2 = vload16(0, matrix_b + 2 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b3 = vload16(0, matrix_b + 3 * src_stride_y);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+
+ matrix_b += 4 * src_stride_y;
+ }
+
+  // This for loop performs the leftover accumulations
+ for (; i < (int)ROWS_B; ++i)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+
+ matrix_b += src_stride_y;
+ }
+
+#if defined(SCALAR)
+ sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR;
+#endif // defined(SCALAR)
+ VSTORE(16)
+ (convert_int16(sum_col_32), 0, (__global int *)dst.ptr);
+}
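+
+/* Illustrative scalar reference (not compiled): a minimal sketch of what gemmlowp_matrix_b_reduction
+ * computes for one column of matrix B, assuming QASYMM8 input; matrix_b, ldb (leading dimension in
+ * elements), rows_b, col and scalar are example names only.
+ *
+ *   #include <stdint.h>
+ *
+ *   static int32_t col_sum_reference(const uint8_t *matrix_b, int ldb, int rows_b, int col,
+ *                                    int32_t scalar)
+ *   {
+ *     int32_t sum_col = 0;
+ *     for (int r = 0; r < rows_b; ++r)
+ *       sum_col += (int32_t)matrix_b[r * ldb + col]; // walk down one column
+ *     return sum_col * scalar;                       // scalar is 1 when -DSCALAR is not passed
+ *   }
+ */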
+#endif // defined(COLS_B) && defined(ROWS_B)
+
+#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(K_OFFSET)
+
+/* Helper function used to calculate the offset contribution after matrix multiplication.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * @param[in] x get_global_id(0) * 4
+ * @param[in] y get_global_id(1)
+ * @param[in] z get_global_id(2)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ */
+inline int4 offset_contribution(int x, int y, int z
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ int4 a_offset_s32 = (int4)0;
+ int4 b_offset_s32 = (int4)0;
+
+ int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+ batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+ // Compute the offset contribution due to A_OFFSET
+ __global uchar *sum_col_addr =
+ sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+  // Load the column sums, adding the batch offset when SUM_COL_HAS_BATCHES is defined
+#if defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else // defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+ a_offset_s32 *= (int4)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+  // Compute the sum_row address
+ __global uchar *sum_row_addr =
+ sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+ // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) +
+ (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 *= (int4)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ b_offset_s32 += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
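+
+/* Illustrative scalar reference (not compiled): for a single output element at column x and row y,
+ * the helper above returns the value sketched below; terms drop out when the corresponding define
+ * (A_OFFSET, B_OFFSET, ADD_BIAS) is absent. The function name is hypothetical.
+ *
+ *   #include <stdint.h>
+ *
+ *   static int32_t offset_contribution_reference(int32_t k_offset, int32_t a_offset,
+ *                                                int32_t b_offset, int32_t sum_col_x,
+ *                                                int32_t sum_row_y, int32_t bias_x)
+ *   {
+ *     return k_offset + a_offset * sum_col_x + b_offset * sum_row_y + bias_x;
+ *   }
+ */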
+
+/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is
+ * performed in-place.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along
+ * X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along
+ * Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along
+ * Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // Store the result with the offset contribution
+ vstore4(in_s32, 0, (__global int *)mm_result_addr);
+}
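+
+/* Worked example (illustrative): with the example values used in the documentation above,
+ * -DA_OFFSET=1, -DB_OFFSET=6 and k = 200 columns in matrix A, the caller is expected to pass
+ * -DK_OFFSET=1*6*200=1200. For an element with sum_col[k] = 10 and sum_row[i] = 20 and no bias,
+ * the in-place update performed by this kernel is:
+ *
+ *   mm_result[i][k] += (10 * 1) + (20 * 6) + 1200;  // i.e. mm_result[i][k] += 1330
+ */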
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && \
+ defined(OUTPUT_DATA_TYPE)
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and
+ * quantize the result down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of
+ * @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix
+ * B and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the
+ * following operations:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND
+ * are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor.
+ * Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of
+ * elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in
+ * the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] dst_step_z src_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Add the offset terms to GEMM's result
+ in_s32 += (int4)RESULT_OFFSET;
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr =
+ result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+ int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
+
+ in_s32 *= result_multipliers_values;
+ in_s32 >>= result_shifts_values;
+#else // defined(PER_CHANNEL_QUANTIZATION)
+ in_s32 *= RESULT_MULTIPLIER;
+
+ in_s32 >>= RESULT_SHIFT;
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
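+
+/* Illustrative scalar reference (not compiled): a minimal sketch of the output stage applied above
+ * to each lane after the offset contribution, assuming per-tensor quantization (RESULT_MULTIPLIER /
+ * RESULT_SHIFT) and a QASYMM8 destination. The function name is hypothetical.
+ *
+ *   #include <stdint.h>
+ *
+ *   static uint8_t quantize_down_reference(int32_t acc, int32_t offset_term, int32_t result_offset,
+ *                                          int32_t result_multiplier, int32_t result_shift)
+ *   {
+ *     acc += offset_term;       // A_OFFSET / B_OFFSET / K_OFFSET (+ bias) contribution
+ *     acc += result_offset;     // RESULT_OFFSET
+ *     acc *= result_multiplier; // RESULT_MULTIPLIER
+ *     acc >>= result_shift;     // RESULT_SHIFT (arithmetic shift)
+ *     if (acc < 0)
+ *       acc = 0;                // MIN_BOUND / saturating cast to QASYMM8
+ *     if (acc > 255)
+ *       acc = 255;              // MAX_BOUND / saturating cast to QASYMM8
+ *     return (uint8_t)acc;
+ *   }
+ */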
+
+/* OpenCL kernel used to add the offset contribution after matrix multiplication and quantize the
+ * result down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to
+ * it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output
+ * stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually
+ * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the
+ * following operations:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor.
+ * Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of
+ * elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in
+ * the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor
+ * Supported data type: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] dst_step_z src_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void
+ gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+ )
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr =
+ result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+ int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
+
+ int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ in_s32, result_multipliers_values, result_shifts_values, 4);
+ int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ in_s32, result_multipliers_values, result_shifts_values, 4);
+ in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ in_s32 =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ // Add the offset terms to GEMM's result
+ in_s32 += (int4)RESULT_OFFSET;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) &&
+ // defined(OUTPUT_DATA_TYPE)
+
+#endif // defined(K_OFFSET)
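+
+/* Illustrative note on the PER_CHANNEL_QUANTIZATION path above (the shift values below are
+ * arbitrary examples, not taken from this patch): both requantization variants are computed for
+ * the whole vector and select() keeps one result lane by lane. With per-channel shifts
+ * (-1, 0, 2, 3), lane 0 keeps the "multiplier greater than one" result (negative shift, extra
+ * left shift), while lanes 1..3 keep the "multiplier less than one" result (rounding right
+ * shift). */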
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final
+ * QASYMM8/QASYMM8_SIGNED value. The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND
+ * are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Add the offset terms to GEMM's result
+ input_values += (int4)RESULT_OFFSET;
+
+ // Multiply by result_mult_int and shift
+ input_values *= RESULT_MULT_INT;
+
+#if RESULT_SHIFT < 0
+ input_values >>= -RESULT_SHIFT;
+#else // RESULT_SHIFT >= 0
+ input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
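+
+/* Worked example for the kernel above (all constants are illustrative, not taken from a real
+ * model): with -DRESULT_OFFSET=2 -DRESULT_MULT_INT=3 -DRESULT_SHIFT=4 -DADD_BIAS, an accumulator
+ * value of 100 with a bias of 20 becomes (100 + 20 + 2) * 3 = 366, then 366 >> 4 = 22, which is
+ * finally saturated to OUTPUT_DATA_TYPE and clamped to [MIN_BOUND, MAX_BOUND] when defined. */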
+
+#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && \
+ defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. The following computations will be
+ * performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER
+ * and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+ // Add the offset terms to GEMM's result
+ input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) &&
+ // defined(RESULT_SHIFT)
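+
+/* Example build options for the fixed-point output stage above (hypothetical values, shown only
+ * to illustrate which -D flags are expected together):
+ *   -DRESULT_OFFSET_AFTER_SHIFT=128 -DRESULT_FIXEDPOINT_MULTIPLIER=1073741824 -DRESULT_SHIFT=3
+ *   -DOUTPUT_DATA_TYPE=uchar -DADD_BIAS -DMIN_BOUND=0 -DMAX_BOUND=255
+ * A multiplier of 2^30 with a shift of 3 corresponds to an effective scale of roughly
+ * (2^30 / 2^31) / 2^3 = 1/16. */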
+
+#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QSYMM16 value. The following computations will be performed by
+ * the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Clamp the value between the specified min and max bounds (no requantization offset is
+ * added, since QSYMM16 is a symmetric format)
+ * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
+ *
+ * @attention The scalar scale factor and number of bits to shift right of the output tensor
+ * must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data
+ * type: QSYMM16
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+ short4 res = convert_short4_sat(input_values);
+
+#if defined(MIN_BOUND)
+ res = max(res, (short4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (short4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global short *)dst_addr);
+}
+#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
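+
+/* Illustrative note: QSYMM16 is a symmetric format, so the kernel above adds no zero-point
+ * offset. With an (arbitrary) RESULT_FIXEDPOINT_MULTIPLIER of 2^30 and RESULT_SHIFT=1, the
+ * effective scale is roughly (2^30 / 2^31) / 2 = 1/4, so an accumulator of 1000 maps to about
+ * 250 before the optional [MIN_BOUND, MAX_BOUND] clamp and the saturating cast to short. */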
+
+#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. The following computations will be
+ * performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Requantize
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using
+ * -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data
+ * type: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in
+ * bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases
+ * tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data
+ * type: QASYMM8
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                          Stride of the destination tensor in W dimension
+ * (in bytes)
+ * @param[in]  dst_step_w                            dst_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+#if defined(DST_HEIGHT)
+ TENSOR4D_DECLARATION(dst))
+#else // defined(DST_HEIGHT)
+ TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Convert to float
+ float4 input_values_f = convert_float4(input_values);
+ input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
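+
+/* Worked example for the float output stage above (constants are arbitrary illustrations): with
+ * -DREAL_MULTIPLIER=0.05f and -DOUTPUT_OFFSET=3, an int32 accumulator of 1000 becomes
+ * round(1000 * 0.05f + 3) = 53 before the saturating cast to OUTPUT_DATA_TYPE and the optional
+ * MIN_BOUND/MAX_BOUND clamp. */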
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl
new file mode 100644
index 000000000..51919c8a5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with a given constant value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The last element accessible along X (threads whose access would run
+ * past it shift their access vector back so the store stays within bounds)
+ *
+ * @param[in] tensor_ptr Pointer to the source image. Data types
+ * supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] tensor_step_x tensor_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] tensor_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] tensor_step_y tensor_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @note The fill value is not a kernel argument; it is supplied at compile time through
+ * -DCONSTANT_VALUE
+ */
+__kernel void memset(TENSOR3D_DECLARATION(tensor))
+{
+ Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = (DATA_TYPE)(CONSTANT_VALUE);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else // !defined(VEC_SIZE)
+ *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // Check for compile time constants
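+
+/* Illustrative trace of the out-of-bounds guard above (numbers are arbitrary): with VEC_SIZE=4, a
+ * tensor width of 10 and LAST_ACCESSED_X=6, the work-item with get_global_id(0) == 2 starts at
+ * xi = 8; the pointer is moved back by max(8 - 6, 0) = 2 elements, so its 4-wide vstore covers
+ * elements 6..9 and stays inside the tensor. */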
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl
new file mode 100644
index 000000000..96f2f9ef0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && \
+ defined(SRC_WIDTH)
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE)
+#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE)
+
+#if defined(CONST_VAL)
+/** Perform a pad operation when PaddingMode is CONSTANT
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag,
+ * e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g.
+ * -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g.
+ * -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile
+ * flag, e.g. -DSELECT_DT=float
+ * @note In case pad left is more than the vector size, the number of threads to skip along the X
+ * axis must be passed using the -DNUM_THREADS_TO_SKIP_X compile flag, e.g.
+ * -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be
+ * passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g.
+ * -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If pad also needs to be added to the batch of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g.
+ * -DPAD_W_BEFORE=3)
+ * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types:
+ * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ * @param[in] batch (Optional) Batch index if 4D pad must be applied
+ */
+__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(PAD_W_BEFORE)
+ ,
+ uint batch
+#endif // defined(PAD_W_BEFORE)
+)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ uint cond = 0;
+
+#if defined(PAD_W_BEFORE)
+ cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
+#endif // defined(PAD_Z_BEFORE)
+
+ if (cond)
+ {
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr);
+ }
+ else
+ {
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(NUM_THREADS_TO_SKIP_X)
+ /* In case the pad left is greater than the vector size, and we are past the threads operating
+ * solely on pad values, the input pointer must be brought back along the X axis to start from
+ * the first non-pad values.
+ *
+ * E.g. with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|:
+ * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1))
+ * < PAD_X_BEFORE
+ * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2,
+ * 3)) < PAD_X_BEFORE
+ * -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x *
+ * VEC_SIZE) == 4) values, reading |4 5|
+ * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and
+ * check that it is >= to the current x
+ * -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying
+ * this constant by the input's step along the X axis
+ * -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it
+ * will read the desired values |0 1|
+ */
+ src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X);
+#endif // defined(NUM_THREADS_TO_SKIP_X)
+#if defined(PAD_Z_BEFORE)
+ src.ptr -= PAD_Z_BEFORE * src_step_z;
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+ src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z;
+#endif // defined(PAD_W_BEFORE)
+
+ VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+ VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT);
+ VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+#if defined(PAD_Y_BEFORE)
+ cond |=
+ (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
+ VSTORE(VEC_SIZE)
+ (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0,
+ (__global DATA_TYPE *)dst.ptr);
+ }
+}
+#endif // defined(CONST_VAL)
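+
+/* Illustrative example for pad_layer_constant above (all compile-time values are hypothetical):
+ * with -DVEC_SIZE=4 -DPAD_X_BEFORE=2 -DSRC_WIDTH=5 -DCONST_VAL=0, the thread at x == 1 computes
+ * xs_out = (4, 5, 6, 7); only the last lane satisfies xs_out >= SRC_WIDTH + PAD_X_BEFORE (== 7),
+ * so select() keeps three loaded input values and writes CONST_VAL in that lane. */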
+
+#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && \
+ defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && \
+ defined(AFTER_PAD_FACT_X)
+
+#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1
+#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
+#define SYMM_REFL_LEFT(x, n0, n1) \
+ select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
+#define SYMM_REFL_RIGHT(x, n0, n1) \
+ select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
+
+/** Perform a pad operation when PaddingMode is SYMMETRIC
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g.
+ * -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g.
+ * -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile
+ * flag, e.g. -DSELECT_DT=float
+ * @note Number of values to the left when operating across left padding must be passed using the
+ * -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
+ * @note Number of values to the left when operating across right padding must be passed using the
+ * -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is
+ * REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER_REFL=6
+ * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the
+ * -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER_REFL=5
+ * @note When after pad X, starting point to read backward from must be passed using the
+ * -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
+ * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be
+ * set to 0
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be
+ * passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g.
+ * -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If the starting point to read backward from is less than the output's last element accessed
+ * in the X, the following compile flags must be passed at compile time to avoid negative offsets:
+ * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation
+ * attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types:
+ * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ */
+__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Get current thread position
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Define conditions based on the thread X position w.r.t. pad left and right
+ const int x_out_first = x * VEC_SIZE;
+ const int x_out_last = x_out_first + VEC_SIZE;
+ const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE);
+ const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
+ const int is_inside_input =
+ (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_across_pad_right =
+ (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
+
+ // Calculate base pointers
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Calculate input tensor's offset based on the defined conditions
+ int x_offset = 0;
+ x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
+ x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
+ x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
+ x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
+
+#if defined(AFTER_PAD_REM)
+ int neg_offs = x_offset < 0;
+ x_offset = max(x_offset, 0);
+#endif // defined(AFTER_PAD_REM)
+
+ // Load input values from the computed offset
+ int y_in = y;
+ int z_in = z;
+#if defined(PAD_Y_BEFORE)
+ y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
+ y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1,
+ y >= (SRC_HEIGHT + PAD_Y_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
+ z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1,
+ z >= (SRC_DEPTH + PAD_Z_BEFORE));
+#endif // defined(PAD_Z_BEFORE)
+
+ src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
+
+#if SRC_WIDTH == 1
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
+#else // SRC_WIDTH == 1
+
+ VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+ // Choose rearrangement policy based on the defined conditions
+ src_vals =
+ select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL),
+ SCALAR_COND(is_across_pad_left));
+ src_vals =
+ select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL),
+ SCALAR_COND(is_across_pad_right));
+ src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE),
+ SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+#if defined(AFTER_PAD_REM)
+ src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+#endif // defined(AFTER_PAD_REM)
+
+ // Store
+ VSTORE(VEC_SIZE)
+ (src_vals, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // SRC_WIDTH == 1
+}
+#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) &&
+ // defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) &&
+ // defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) &&
+ // defined(SRC_WIDTH)
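+
+/* Illustrative 1-D example of the two modes handled above (input and pad sizes are arbitrary):
+ * padding [1 2 3 4] with two elements on the left gives
+ *   SYMMETRIC (IS_REFLECT=0): 2 1 | 1 2 3 4   (the border element is repeated)
+ *   REFLECT   (IS_REFLECT=1): 3 2 | 1 2 3 4   (the border element is not repeated) */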
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h
new file mode 100644
index 000000000..cfc811cce
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+#include "helpers.h"
+
+/** Macros that help in loop unrolling */
+// Repeat macros with 3 param, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(1, P_A, P_B, P_C); \
+ REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(2, P_A, P_B, P_C); \
+ REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(3, P_A, P_B, P_C); \
+ REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(4, P_A, P_B, P_C); \
+ REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(5, P_A, P_B, P_C); \
+ REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(6, P_A, P_B, P_C); \
+ REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(7, P_A, P_B, P_C); \
+ REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(8, P_A, P_B, P_C); \
+ REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(9, P_A, P_B, P_C); \
+ REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(A, P_A, P_B, P_C); \
+ REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(B, P_A, P_B, P_C); \
+ REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(C, P_A, P_B, P_C); \
+ REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(D, P_A, P_B, P_C); \
+ REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(E, P_A, P_B, P_C); \
+ REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(F, P_A, P_B, P_C); \
+ REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+ REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) // One level of indirection to ensure order of expansion
+ // does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+// Repeat macros with 4 param, excluding the implicit ID param
+#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
+#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(1, P_A, P_B, P_C, P_D); \
+ REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(2, P_A, P_B, P_C, P_D); \
+ REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(3, P_A, P_B, P_C, P_D); \
+ REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(4, P_A, P_B, P_C, P_D); \
+ REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(5, P_A, P_B, P_C, P_D); \
+ REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(6, P_A, P_B, P_C, P_D); \
+ REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(7, P_A, P_B, P_C, P_D); \
+ REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(8, P_A, P_B, P_C, P_D); \
+ REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(9, P_A, P_B, P_C, P_D); \
+ REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(A, P_A, P_B, P_C, P_D); \
+ REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(B, P_A, P_B, P_C, P_D); \
+ REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(C, P_A, P_B, P_C, P_D); \
+ REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(D, P_A, P_B, P_C, P_D); \
+ REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(E, P_A, P_B, P_C, P_D); \
+ REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(F, P_A, P_B, P_C, P_D); \
+ REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
+
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) // One level of indirection to ensure order of
+ // expansion does not affect preprocessing P_NUM
+#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
+
+// Macro for initializing N variables. Generates N statements that defines VAR##N =
+// RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
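+// For illustration (hypothetical variable names): REPEAT_VAR_INIT_TO_CONST(3, int4, acc, 0)
+// expands through REPEAT_3_3 into
+//   int4 acc2 = 0; int4 acc1 = 0; int4 acc0 = 0;
+// i.e. one declaration per ID, counting down from N-1 to 0.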
+
+// Macro for initializing N variables by converting the data type. Generates N statements that
+// defines VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \
+ TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+
+// Macro for adding a constant to N variables. Generates N statements that defines VAR##N
+// =RHS_ACCESSOR_DEF(...)
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
+
+// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables
+// (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...)
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) \
+ REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
+
+// Macro for adding a vector to N-variables. Generates N statements that defines VAR##N
+// =RHS_ACCESSOR_DEF(...)
+#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+
+// Macro for adding two N-variables. Generates N statements that define VAR##N
+// =RHS_ACCESSOR_DEF(...)
+#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+
+// Macro for performing Max between a constant and N variables. Generates N statements that defines
+// VAR##N =RHS_ACCESSOR_DEF(...)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
+
+// Macro for performing Min between a constant and N variables. Generates N statements that defines
+// VAR##N =RHS_ACCESSOR_DEF(...)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
+
+// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N
+// statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N
+// statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ ({ \
+ VEC_DATA_TYPE(int, N0) \
+ VAR##ID_shift_lt0 = \
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
+ VEC_DATA_TYPE(int, N0) \
+ VAR##ID_shift_gt0 = \
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
+ VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \
+ })
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
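+// Note: the per-channel helper above relies on N0 (not SIZE) as the vector width inside the
+// statement expression, so N0 must be defined by the including kernel. Hypothetical usage,
+// assuming N0 == 4 and accumulators c0..c3:
+//   REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(4, 4, c, res_mul, res_shift);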
+
+#endif // ARM_COMPUTE_REPEAT_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl
new file mode 100644
index 000000000..8da8bfc8e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported
+ * data types: All
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension
+ * (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first
+ * source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] input_shape Input spatial shape
+ * @param[in] output_shape Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output),
+ int2 input_shape, int2 output_shape)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+
+ // Linearize index
+ int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
+
+ // Translate to output
+ int3 out_id;
+ out_id.x = linear_idx % output_shape.x;
+ out_id.y = (linear_idx / output_shape.x) % output_shape.y;
+ out_id.z = linear_idx / (output_shape.x * output_shape.y);
+
+ // Store result
+ *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) =
+ *((__global DATA_TYPE *)in.ptr);
+}
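
The kernel above copies one element per work-item: it linearizes the work-item's (x, y, z) coordinate against the input shape and re-expands that linear index against the output shape. A minimal host-side C++ sketch of the same index arithmetic follows; the shapes and the chosen work-item id are purely illustrative.

    #include <array>
    #include <cassert>
    #include <cstdio>

    int main()
    {
      const std::array<int, 3> in_shape{4, 2, 3};  // W, H, D of the source tensor (assumed)
      const std::array<int, 3> out_shape{2, 4, 3}; // W, H, D of the reshaped tensor (assumed)
      const std::array<int, 3> id{3, 1, 2};        // one work-item's global id

      // Linearize against the input shape, as in the kernel.
      const int linear_idx = id[0] + id[1] * in_shape[0] + id[2] * in_shape[0] * in_shape[1];

      // Translate the linear index back to an (x, y, z) coordinate in the output shape.
      const int out_x = linear_idx % out_shape[0];
      const int out_y = (linear_idx / out_shape[0]) % out_shape[1];
      const int out_z = linear_idx / (out_shape[0] * out_shape[1]);

      std::printf("linear=%d -> out=(%d, %d, %d)\n", linear_idx, out_x, out_y, out_z); // 23 -> (1, 3, 2)
      assert(linear_idx == out_x + out_y * out_shape[0] + out_z * out_shape[0] * out_shape[1]);
      return 0;
    }
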
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
index 45307fad7..987409739 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
@@ -39,16 +39,18 @@
*/
#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index ffa2c5a67..a5daa2410 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -43,6 +43,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/AccessWindowStatic.h"
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
index 3f2ae357d..dc06bfbb3 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
@@ -41,13 +41,16 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include <cstddef>
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index e4c617c8d..4206f1fd4 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 8b5885225..62da2376e 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -45,6 +45,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/core/UtilsEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index f0a761b97..03ca6ddcb 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -111,7 +112,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
_hits = hits;
// Make _lookup_indices tensor
- _lookup_indices = support::cpp14::make_unique<CLTensor>();
+ _lookup_indices = std::make_unique<CLTensor>();
_lookup_indices->allocator()->init(
TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
_lookup_indices->allocator()->allocate();
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index dab6480b2..945af3c51 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -42,12 +42,16 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp
new file mode 100644
index 000000000..a00fc5e2e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+CLMemsetKernel::CLMemsetKernel() : ICLKernel(), _tensor(nullptr), _full_window() {}
+
+void CLMemsetKernel::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window);
+}
+
+void CLMemsetKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor,
+ const PixelValue &constant_value, Window *window)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window));
+
+ _tensor = tensor;
+
+ const DataType data_type = tensor->info()->data_type();
+ const int vec_size_x = 16 / tensor->info()->element_size();
+
+ // Create and update the window (if needed)
+ _full_window = calculate_max_window(*tensor->info());
+ Window win = _full_window;
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+ win = *window;
+ }
+
+ const int output_width_x = win.num_iterations(0);
+ const bool multi_access_x = output_width_x >= vec_size_x;
+ const bool remainder_x = output_width_x % vec_size_x > 0;
+
+ if (multi_access_x)
+ {
+ win.set(
+ Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("memset", build_opts.options()));
+}
+
+Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value,
+ Window *window)
+{
+ ARM_COMPUTE_UNUSED(tensor);
+ ARM_COMPUTE_UNUSED(constant_value);
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+ }
+ return Status{};
+}
+
+void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Collapse all the batches on the third dimension
+ Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _tensor, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
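
CLMemsetKernel::configure sizes its vector accesses from the element size (16 bytes per access) and only defines LAST_ACCESSED_X when a partial vector remains at the end of the row. The sketch below replays that arithmetic on the host for an assumed F32 tensor that is 35 elements wide; the numbers are illustrative, not taken from the patch.

    #include <algorithm>
    #include <cstdio>

    int main()
    {
      const int element_size   = 4;                 // sizeof(float), assumed F32
      const int output_width_x = 35;                // win.num_iterations(0) in the kernel
      const int vec_size_x     = 16 / element_size; // 4 floats per 16-byte access

      const bool multi_access_x = output_width_x >= vec_size_x;
      const bool remainder_x    = output_width_x % vec_size_x > 0;

      // The last full vector access must not run past the row, so the kernel clamps it.
      const int last_accessed_x = std::max<int>(output_width_x - vec_size_x, 0);

      std::printf("vec_size=%d multi=%d remainder=%d LAST_ACCESSED_X=%d\n",
                  vec_size_x, multi_access_x, remainder_x, last_accessed_x); // 4 1 1 31
      return 0;
    }
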
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
index 1d4b141a7..da7437e97 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -40,15 +40,19 @@
#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index ee633d437..cd5e571e9 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
index 0b8e7cc41..4c4cbe710 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
@@ -42,6 +42,10 @@
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include <string>
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp
new file mode 100644
index 000000000..b6efeac35
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(constant_value);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+ for (size_t i = 0; i < padding.size(); ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
+ }
+ }
+
+ if (output->total_size() > 0)
+ {
+ TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ ARM_COMPUTE_UNUSED(constant_value, mode);
+
+ const TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
+
+ num_elems_processed_per_iteration =
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type())));
+ if (input->dimension(0) < num_elems_processed_per_iteration)
+ {
+ num_elems_processed_per_iteration =
+ 1 << static_cast<unsigned int>(std::log2(input->dimension(0)));
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ const int input_start_x =
+ mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0;
+ const int input_start_y =
+ (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+
+ AccessWindowRectangle input_access(input, input_start_x, input_start_y,
+ num_elems_processed_per_iteration, 1);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ const bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLPadLayerKernelEx::CLPadLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false)
+{
+}
+
+void CLPadLayerKernelEx::configure(const ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value,
+ mode);
+}
+
+void CLPadLayerKernelEx::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+ ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), output->info(), padding, constant_value, mode));
+
+ _input = input;
+ _output = output;
+ _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
+
+ // Configure window
+ unsigned int vec_size;
+ auto win_config = validate_and_configure_window(input->info(), output->info(), padding,
+ constant_value, mode, vec_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set build options
+ std::string kernel_name = "pad_layer_";
+
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int pad_right_start = input_width + pad_x_before;
+
+ _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0;
+ _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+ if (padding.size() > 1)
+ {
+ build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+
+ if (padding.size() > 2)
+ {
+ build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
+ }
+ }
+
+ switch (mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ kernel_name += "constant";
+
+ build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(pad_x_before >= vec_size,
+ "-DNUM_THREADS_TO_SKIP_X=" +
+ support::cpp11::to_string(pad_x_before / vec_size));
+
+ if (_4d_enabled)
+ {
+ build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
+ build_opts.add_option("-DSRC_BATCH=" +
+ support::cpp11::to_string(input->info()->dimension(3)));
+ }
+
+ break;
+ }
+ case PaddingMode::SYMMETRIC:
+ case PaddingMode::REFLECT:
+ {
+ kernel_name += "symmetric_reflect";
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
+ const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
+ const unsigned int output_last_x =
+ ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+
+ build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" +
+ support::cpp11::to_string(pad_x_before_remainder));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" +
+ support::cpp11::to_string(pad_x_after_remainder));
+ build_opts.add_option(
+ "-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option(
+ "-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" +
+ support::cpp11::to_string(after_pad_fact_x % vec_size));
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+}
+
+Status CLPadLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ unsigned int vec_size;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ output->clone().get(), padding,
+ constant_value, mode, vec_size)
+ .first);
+
+ return Status{};
+}
+
+void CLPadLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window win_in = window;
+ win_in.adjust(Window::DimX, _input_start_x, true);
+ win_in.adjust(Window::DimY, _input_start_y, true);
+
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_in = win_in.first_slice_window_3D();
+ unsigned int batch = 0;
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ if (_4d_enabled)
+ {
+ add_argument<unsigned int>(idx, batch++);
+ }
+
+ enqueue(queue, *this, slice_out, lws_hint());
+ } while (window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
+} // namespace arm_compute
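
In validate_and_configure_window above, the number of elements processed per iteration is capped at 16 elements or 32 bytes, and for rows narrower than that it is rounded down to a power of two. The host-side sketch below mirrors that selection for a few assumed element sizes and row widths; it is an illustration of the arithmetic, not part of the patch.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Mirrors the vector-size choice in CLPadLayerKernelEx's window configuration (sketch).
    static unsigned int pad_layer_vec_size(unsigned int element_size, unsigned int input_width)
    {
      unsigned int n = std::min(16u, 32u / element_size); // at most 16 elements or 32 bytes
      if (input_width < n)
      {
        n = 1u << static_cast<unsigned int>(std::log2(input_width)); // round down to a power of two
      }
      return n;
    }

    int main()
    {
      std::printf("%u\n", pad_layer_vec_size(4, 64)); // F32, wide row      -> 8
      std::printf("%u\n", pad_layer_vec_size(1, 64)); // QASYMM8, wide row  -> 16
      std::printf("%u\n", pad_layer_vec_size(4, 5));  // narrow F32 row     -> 4 (2^2)
      return 0;
    }
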
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
index b417a7103..9aa815f55 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -40,15 +40,19 @@
#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index 3906009c2..70374ba61 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
index 4a6374444..c9d6dc31c 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -40,7 +40,7 @@
#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
+#include "src/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -48,6 +48,10 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include <climits>
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
index c88bef6d7..1d4d33ac2 100644
--- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
@@ -42,7 +42,7 @@
#include <algorithm>
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEAsymm.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
index a8464afce..0551fc7db 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
@@ -43,10 +43,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <algorithm>
#include <arm_neon.h>
@@ -163,7 +163,7 @@ void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out,
std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func(
const ITensor *input1, const ITensor *input2, ITensor *output,
- std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
+ std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *> map_function)
{
std::string function_to_call("op_");
function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
@@ -185,9 +185,9 @@ template <BinaryLogicalOperation op>
std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = {
- {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
- {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
+ static std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *>
+ map_function = {{"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
+ {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
return configure_func(input1, input2, output, map_function);
}
@@ -196,7 +196,7 @@ void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const
const ITensor *input2, ITensor *output)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
- configure_common(input1, input2, output);
+ configure_common(input1->info(), input2->info(), output->info());
switch (op)
{
case BinaryLogicalOperation::AND:
@@ -251,5 +251,4 @@ Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op,
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
return Status{};
}
-
} // namespace arm_compute
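
configure_func above dispatches on a string key assembled from the data types of the two inputs and the output ("op_<dt1>_<dt2>_<dt3>"). The self-contained sketch below shows that lookup pattern with a toy bitwise-AND stand-in for the elementwise functions and hard-coded U8 type names; both are assumptions made for illustration only.

    #include <cstdio>
    #include <functional>
    #include <map>
    #include <string>

    using LogicFn = std::function<int(int, int)>;

    int main()
    {
      // Map from type-signature key to implementation, as in configure_logic_func.
      const std::map<std::string, LogicFn> map_function = {
        {"op_U8_U8_U8", [](int a, int b) { return a & b; }},
        {"op_QASYMM8_QASYMM8_QASYMM8", [](int a, int b) { return a & b; }},
      };

      // Build the key the same way configure_func does from the tensor data types.
      std::string function_to_call = "op_";
      function_to_call += "U8_"; // string_from_data_type(input1) + "_"
      function_to_call += "U8_"; // string_from_data_type(input2) + "_"
      function_to_call += "U8";  // string_from_data_type(output)

      const auto it = map_function.find(function_to_call);
      if (it != map_function.end())
      {
        std::printf("%d\n", it->second(0xF0, 0x3C)); // prints 48 (0x30)
      }
      return 0;
    }
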
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
index f935596e6..87e716b4f 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
@@ -39,16 +39,19 @@
*/
#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
+#include "src/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/SaturateCast.h"
+#include "support/SaturateCast.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/NEON/INEKernel.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
index e3a77c6b1..3ad9ee945 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
@@ -47,6 +47,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
using namespace arm_compute;
NEEmbeddingLookupKernel::NEEmbeddingLookupKernel()
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 000000000..375fa28e5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+using namespace arm_compute;
+
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0));
+
+ return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum,
+ ITensorInfo *biases)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ bool window_changed = update_window_and_padding(
+ win, AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration),
+ AccessWindowStatic(biases, 0, 0,
+ ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration),
+ biases->tensor_shape().y()));
+
+ AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration);
+
+ // Set the valid region for the accum tensor
+ Coordinates coord;
+ coord.set_num_dimensions(accum->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
+
+ _biases = biases;
+ _accum = accum;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(accum->info(), biases->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum,
+ const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(accum->clone().get(), biases->clone().get()).first);
+
+ return Status{};
+}
+
+std::mutex m;
+void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info)
+{
+ std::lock_guard<std::mutex> lock_guard(m);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_biases;
+ win_biases.set(Window::DimX,
+ Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in0_out(_accum, window);
+ Iterator in1(_biases, win_biases);
+
+ switch (_accum->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
+ const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
+ const float32x4x4_t res = {
+ {vaddq_f32(accum.val[0], biases.val[0]), vaddq_f32(accum.val[1], biases.val[1]),
+ vaddq_f32(accum.val[2], biases.val[2]), vaddq_f32(accum.val[3], biases.val[3])}};
+
+ vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
+ const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
+ const float16x8x2_t res = {
+ {vaddq_f16(accum.val[0], biases.val[0]), vaddq_f16(accum.val[1], biases.val[1])}};
+
+ vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+}
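
The F32 path of NEGEMMMatrixAccumulateBiasesKernel::run above loads 16 accumulator values and 16 bias values per iteration (vld4q_f32), adds them lane-wise, and stores the sums back in place, with the 1-D bias vector broadcast across the accumulator rows. A scalar C++ sketch of the same computation on an assumed 2x3 accumulator follows; the shape and values are illustrative.

    #include <cstdio>
    #include <vector>

    int main()
    {
      const int width = 3, height = 2;
      std::vector<float> accum{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; // height x width, row-major
      const std::vector<float> biases{10.f, 20.f, 30.f};      // one bias per column

      // Add the bias vector to every row of the accumulator, as the NEON loop does 16-wide.
      for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x)
          accum[y * width + x] += biases[x];

      for (float v : accum)
        std::printf("%.0f ", v); // 11 22 33 14 25 36
      std::printf("\n");
      return 0;
    }
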
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
index c9f0799d4..d4144e6b9 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -40,7 +40,7 @@
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -50,6 +50,9 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
namespace arm_compute
{
namespace
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
index 52b40e767..f178865b7 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -47,6 +47,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <unordered_map>
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
index 4dc0f5535..7804f9c6a 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -40,17 +40,22 @@
#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
index ad4728175..8ad998313 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -42,13 +42,15 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
index 0daff5c6a..e56fbf7f3 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
@@ -38,7 +38,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -47,6 +47,10 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
namespace arm_compute
{
namespace
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
index 2306228d5..420e5063c 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -42,13 +42,16 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
index b02a48ef2..6b9b0d4b4 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
@@ -45,7 +45,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/Utils.h"
namespace arm_compute
{
@@ -66,7 +68,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT
"Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const unsigned int num_of_stages =
- calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+ utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
DataType output_data_type = DataType::S32;
TensorInfo not_reshaped_output;
@@ -132,7 +134,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT
ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output));
return Status{};
}
@@ -140,7 +142,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *
const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+ _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
_reduction_axis = axis;
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
@@ -204,7 +206,8 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *
&_not_reshaped_output, axis, op);
_results_vector[last_stage - 1].allocator()->allocate();
}
- _reshape_kernel.configure(&_not_reshaped_output, output);
+ _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output,
+ output);
_not_reshaped_output.allocator()->allocate();
}
@@ -216,6 +219,6 @@ void CLArgMinMaxLayerEx::run()
{
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
- CLScheduler::get().enqueue(_reshape_kernel, false);
+ _reshape_kernel.run();
}
} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
index e5122ab8f..31c96b080 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -42,13 +42,14 @@
#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
using namespace arm_compute;
void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
BinaryLogicalOperation op)
{
- auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ auto k = std::make_unique<CLBinaryLogicalOpKernel>();
k->configure(input1, input2, output, op);
_kernel = std::move(k);
@@ -57,7 +58,7 @@ void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTenso
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
if (broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
}
}
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
index c7d0ac8e2..96f9c17a9 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
@@ -46,7 +46,7 @@ using namespace arm_compute;
void CLCastBool::configure(ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>();
+ auto k = std::make_unique<CLCastBoolKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
index 6359b4bcb..464f60dee 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
@@ -45,6 +45,8 @@
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <memory>
#include <tuple>
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
index ae9d8afc6..003ec8042 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -39,7 +39,6 @@
*/
#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
-
#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
using namespace arm_compute;
@@ -47,7 +46,7 @@ using namespace arm_compute;
void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
const ICLTensor *lookups)
{
- auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ auto k = std::make_unique<CLEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
index 79d0929a9..af936e873 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -45,7 +45,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
#include <algorithm>
@@ -68,7 +67,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = std::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
index 13d3acbac..c6a88d340 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -42,11 +42,11 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
+
+#include "support/Cast.h"
#include <algorithm>
@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = std::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
index ac6982e6f..cda784541 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -19,6 +19,7 @@
#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h>
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 000000000..cd7409417
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "support/StringSupport.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ // Select the vector size to use (8 for Bifrost; 16 for Midgard).
+ bool is_gpu_bifrost =
+ gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, GPUTarget::G51,
+ GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT);
+ num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(
+ biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration),
+ biases->dimension(1));
+ AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, biases_access, accum_access);
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), accum, biases);
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *accum, const ICLTensor *biases)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
+
+ _biases = biases;
+ _accum = accum;
+
+ // Get the target gpu
+ GPUTarget gpu_target = get_target();
+ unsigned int vector_size = 0;
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("gemm_accumulate_biases", build_opts.options()));
+}
+
+Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum,
+ const ITensorInfo *biases, GPUTarget gpu_target)
+{
+ unsigned int num_elems_processed_per_iteration = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(),
+ biases->clone().get(), gpu_target,
+ num_elems_processed_per_iteration)
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window accum_slice = window.first_slice_window_2D();
+
+ Window biases_slice(accum_slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _accum, accum_slice);
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+
+ enqueue(queue, *this, accum_slice, lws_hint());
+ } while (window.slide_window_slice_2D(accum_slice));
+}
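
A hedged usage sketch for the kernel added above, assuming the usual CLTensor allocation workflow (tensor shapes, data types, and allocation omitted). configure() validates the inputs and builds the OpenCL kernel, and CLScheduler enqueues it over the configured window; this is illustrative only and not part of the patch:

#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

// Accumulate a 1-D bias tensor into a GEMM result in-place on the GPU.
// `accum` and `biases` are assumed to be allocated F32 tensors of matching width.
void accumulate_biases(arm_compute::CLTensor &accum, arm_compute::CLTensor &biases)
{
  arm_compute::CLGEMMMatrixAccumulateBiasesKernel kernel;
  kernel.configure(&accum, &biases);               // validation errors surface via ARM_COMPUTE_ERROR_THROW_ON
  arm_compute::CLScheduler::get().enqueue(kernel); // runs over the window configured above
}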
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
index e0b833b04..f380e3e2c 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -41,6 +41,8 @@
#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
using namespace arm_compute;
@@ -48,7 +50,7 @@ using namespace arm_compute;
void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
int axis)
{
- auto k = support::cpp14::make_unique<CLGatherExKernel>();
+ auto k = std::make_unique<CLGatherExKernel>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
index 65b89a389..9896abd4b 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
{
- auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
+ auto k = std::make_unique<CLHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
index 5a7e40839..ca45a57f8 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
ICLTensor *gamma, ICLTensor *beta, float epsilon)
{
- auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ auto k = std::make_unique<CLInstanceNormalizationLayerKernelEx>();
k->configure(input, output, gamma, beta, epsilon);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
index 28e5bc0da..2bdc451b3 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -46,7 +46,7 @@ using namespace arm_compute;
void CLNeg::configure(ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ auto k = std::make_unique<CLNegKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
index aa9f32ec6..759a19ff3 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
@@ -41,7 +41,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
+
namespace arm_compute
{
CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
new file mode 100644
index 000000000..4d940e966
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+
+namespace arm_compute
+{
+CLPadLayerEx::CLPadLayerEx()
+ : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()),
+ _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false)
+{
+}
+
+void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value,
+ mode);
+}
+
+void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor *input,
+ ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(input->info(), output->info(), padding, constant_value, mode));
+
+ _perform_pad = std::any_of(padding.begin(), padding.end(),
+ [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
+
+ if (_perform_pad)
+ {
+ _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
+ }
+ else
+ {
+ Window copy_window = Window();
+ copy_window.use_tensor_dimensions(output->info()->tensor_shape());
+ // Copy the input to the whole output if no padding is applied
+ _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window);
+ }
+}
+Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) {
+ return info.first > 0 || info.second > 0;
+ });
+
+ if (perform_pad)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPadLayerKernelEx::validate(input, output, padding, constant_value, mode));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output));
+ }
+ return Status{};
+}
+void CLPadLayerEx::run()
+{
+ if (_perform_pad)
+ {
+ CLScheduler::get().enqueue(*_pad_kernel);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(*_copy_kernel);
+ }
+}
+} // namespace arm_compute
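
A hedged usage sketch for CLPadLayerEx as added above (tensor creation and allocation omitted). With a non-empty PaddingList it enqueues CLPadLayerKernelEx; otherwise it falls back to a plain copy of the input into the output:

#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

// Pad a tensor with one zero element before and after each of its first two dimensions.
// `input` and `output` are assumed to be configured ICLTensors with compatible shapes.
void pad_with_zeros(arm_compute::ICLTensor *input, arm_compute::ICLTensor *output)
{
  arm_compute::PaddingList padding = {{1, 1}, {1, 1}}; // (before, after) per dimension
  arm_compute::CLPadLayerEx pad;
  pad.configure(input, output, padding, arm_compute::PixelValue(0.f),
                arm_compute::PaddingMode::CONSTANT);
  pad.run(); // enqueues the pad kernel, or the copy kernel when all padding amounts are zero
}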
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index c246041bb..6740835a8 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -61,7 +61,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1);
// Create temporary tensor infos
- auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+ auto interm_tensors = std::make_unique<TensorInfo[]>(num_of_interm_tensors);
// Create intermediate tensor info
TensorShape shape{input->tensor_shape()};
@@ -124,8 +124,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
throw std::runtime_error("CLReduceOperation: there is no axis to reduce");
}
- _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
- _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+ _interm_tensors = std::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels = std::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
// Set a vector that is ordered ICLTensors sequentially.
std::vector<ICLTensor *> tensors;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
index 12c0aa829..73f5f6eb1 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
@@ -47,6 +47,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include <cassert>
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
index 0754fd813..f3f093c18 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -79,7 +79,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC
{
case DeconvolutionMethod::DIRECT:
{
- auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
+ auto f = std::make_unique<CLDirectTransposeConvLayer>();
f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
invalid_bottom, weights_info);
_function = std::move(f);
@@ -87,7 +87,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC
}
case DeconvolutionMethod::GEMM:
{
- auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
f->configure(compile_context, input, weights, bias, output, deconv_info);
_function = std::move(f);
break;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
index 2fc94b267..e6b7329d1 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -38,11 +38,10 @@
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
+#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
#include "arm_compute/core/ITensor.h"
-#include "support/MemorySupport.h"
#include <utility>
@@ -53,7 +52,7 @@ template <BinaryLogicalOperation COP>
void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
ITensor *output)
{
- auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = std::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(COP, input1, input2, output);
_kernel = std::move(k);
}
@@ -69,7 +68,7 @@ Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
BinaryLogicalOperation op)
{
- auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = std::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(op, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
index 6ad3e1b12..f6eec2603 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
@@ -40,13 +40,12 @@
#include "arm_compute/runtime/NEON/functions/NECastBool.h"
#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NECastBool::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>();
+ auto k = std::make_unique<NECastBoolKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
index e0ab3e025..99fc5c579 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -41,13 +41,12 @@
#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
{
- auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ auto k = std::make_unique<NEEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
index e212a03c7..fbd88fff0 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
- auto k = support::cpp14::make_unique<NETransposeKernel>();
+ auto k = std::make_unique<NETransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
index a639f2979..758f7dc59 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -50,7 +50,8 @@
#include <algorithm>
#include <cmath>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
namespace
@@ -164,9 +165,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
const ITensor *biases, ITensor *output,
FullyConnectedLayerInfo fc_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
// Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
fc_info));
@@ -348,7 +348,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
(input->dimension(0) * input->dimension(1) * input->dimension(2))));
// Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input));
input_to_use = &flatten_input;
}
else
@@ -374,9 +374,13 @@ void NEFullyConnectedLayerEx::run()
if (!_is_prepared)
{
if (!_are_weights_reshaped)
+ {
_reshape_weights_output.allocator()->allocate();
+ }
if (!_are_weights_converted)
+ {
_converted_weights_output.allocator()->allocate();
+ }
_is_prepared = true;
}
@@ -407,7 +411,7 @@ void NEFullyConnectedLayerEx::run()
// Linearize input if it comes from a convolutional layer
if (_is_fc_after_conv)
{
- NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+ _flatten_kernel.run();
}
// Run matrix multiply
@@ -490,3 +494,4 @@ void NEFullyConnectedLayerEx::prepare()
}
#endif
}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
index 234c783f9..2199839fb 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -19,6 +19,8 @@
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+#include "src/core/helpers/AutoConfiguration.h"
+#include <cassert>
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
index 433c35d58..e5607ab9a 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -41,7 +41,6 @@
#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "support/MemorySupport.h"
#include <utility>
@@ -49,7 +48,7 @@ namespace arm_compute
{
void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
{
- auto k = support::cpp14::make_unique<NEGatherKernelEx>();
+ auto k = std::make_unique<NEGatherKernelEx>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
index 52d58accf..7cc6c89e7 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -41,14 +41,13 @@
#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
ITensor *output, ITensor *hits)
{
- auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
+ auto k = std::make_unique<NEHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
index 275c55024..e0620bad2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
@@ -39,14 +39,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "support/MemorySupport.h"
+
#include <utility>
namespace arm_compute
{
void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
const ITensor *off_value, ITensor *output, int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>();
+ auto k = std::make_unique<NEOneHotKernel>();
k->configure(indices, depth, on_value, off_value, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
index c45c335b3..a30c00ea1 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -40,11 +40,13 @@
#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
index b21717e86..7a1342644 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -40,9 +40,13 @@
#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
index 50311071b..4675121b2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -44,6 +44,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute::misc::shape_calculator;