| author | Chunseok Lee <chunseok.lee@samsung.com> | 2021-04-20 18:01:41 +0900 |
|---|---|---|
| committer | Chunseok Lee <chunseok.lee@samsung.com> | 2021-04-20 18:01:41 +0900 |
| commit | 589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e (patch) | |
| tree | 47a2b23ce4220e3a4150c8b12ed941555272fb0c /compute | |
| parent | 62529acabbafce7730601ed01d5709d7bc0d378a (diff) | |
Imported Upstream version 1.15.0 (tags: upstream/1.15.0, submit/tizen/20210427.093759, submit/tizen/20210423.055448, submit/tizen/20210422.015846, submit/tizen/20210421.062230, accepted/tizen/unified/20210428.040443)
Diffstat (limited to 'compute')
146 files changed, 17438 insertions, 364 deletions
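Most of the changes below track the ARM Compute Library's internal header reorganization (kernel headers such as ICLKernel.h and INEKernel.h are now included from src/core/ rather than arm_compute/core/), close namespaces with explicit comments, and add local ARMComputeEx copies of kernels such as CLGEMMMatrixAccumulateBiasesKernel, CLMemsetKernel, CLPadLayerKernelEx/CLPadLayerEx and NEGEMMMatrixAccumulateBiasesKernel. As a reading aid, here is a minimal usage sketch of the accumulate-biases kernel's documented configure/validate/run interface; the tensor shapes, scheduler setup and the function name `accumulate_biases_example` are illustrative assumptions, not code from this commit.

```cpp
// Minimal sketch (assumption, not part of this commit): exercising the
// CLGEMMMatrixAccumulateBiasesKernel interface declared in the new header
// added below. Shapes and values are illustrative only.
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void accumulate_biases_example()
{
  CLScheduler::get().default_init(); // create the OpenCL context and queue

  // Accumulation buffer (e.g. a GEMM output) and a 1D bias vector of the
  // same data type, as required by the kernel's documentation.
  CLTensor accum, biases;
  accum.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
  biases.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
  accum.allocator()->allocate();
  biases.allocator()->allocate();

  // validate() checks the tensor infos before any OpenCL kernel is built.
  ARM_COMPUTE_ERROR_THROW_ON(CLGEMMMatrixAccumulateBiasesKernel::validate(
    accum.info(), biases.info(), CLScheduler::get().target()));

  CLGEMMMatrixAccumulateBiasesKernel kernel;
  kernel.configure(&accum, &biases);  // accum += biases, row by row
  CLScheduler::get().enqueue(kernel); // run on the default CL queue
}
```

The NEGEMMMatrixAccumulateBiasesKernel added further down exposes the same configure/validate pair, but its run() override takes a Window and a ThreadInfo and is scheduled on the CPU rather than a CL command queue.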
diff --git a/compute/.clang-format b/compute/.clang-format
deleted file mode 120000
index 0ff66f331..000000000
--- a/compute/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../.clang-format.8
\ No newline at end of file diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index 4a3717885..d3e116381 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -264,5 +264,5 @@ private: _program_source_map; /**< Contains sources for all programs. Used for compile-time kernel inclusion. >*/ }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h index a0aa0560b..46d4ae858 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h @@ -40,7 +40,7 @@ #ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H #define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h index bb6fcb8f5..eac866b67 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -41,8 +41,8 @@ #ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ #define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h index ed668fd9c..cf671102e 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ #define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ -#include "arm_compute/core/CL/ICLSimple3DKernel.h" +#include "src/core/CL/ICLSimple3DKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h index fb689f747..6729fb0f1 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ #define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h new file mode 100644 index 000000000..64908ab59 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H +#define ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H + +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +/** Interface to add a bias to each row of the input tensor + * + */ +class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLGEMMMatrixAccumulateBiasesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMMatrixAccumulateBiasesKernel & + operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types + * supported: Same as @p input + */ + void configure(ICLTensor *accum, const ICLTensor *biases); + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. 
Data + * types supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, ICLTensor *accum, + const ICLTensor *biases); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLGEMMMatrixAccumulateBiasesKernel + * + * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types + * supported: Same as @p input + * @param[in] gpu_target GPU target + * + * @return a status + */ + static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_accum; + const ICLTensor *_biases; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h index 6630c7be7..a55f2401d 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ #define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h index 96f830898..f9d6f7cc5 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ #define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h index f57e799ad..7da9e9a4c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ #define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h new file mode 100644 index 000000000..4befdd05c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H +#define ARM_COMPUTE_CLMEMSETKERNEL_H + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for filling the planes of a tensor */ +class CLMemsetKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLMemsetKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMemsetKernel(const CLMemsetKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMemsetKernel &operator=(const CLMemsetKernel &) = delete; + /** Allow instances of this class to be moved */ + CLMemsetKernel(CLMemsetKernel &&) = default; + /** Allow instances of this class to be moved */ + CLMemsetKernel &operator=(CLMemsetKernel &&) = default; + /** Default destructor */ + ~CLMemsetKernel() = default; + + /** Initialise the kernel's tensor and filling value + * + * @param[in,out] tensor Input tensor to fill. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default + * is nullptr. + */ + void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr); + /** Initialise the kernel's tensor and filling value + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Input tensor to fill. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default + * is nullptr. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *tensor, + const PixelValue &constant_value, Window *window = nullptr); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLMemsetKernel + * + * @param[in] tensor Source tensor info. Data types supported: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default is + * nullptr. + * + * @return a status + */ + static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, + Window *window = nullptr); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_tensor; + Window _full_window; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLMEMSETRKERNEL_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h index 90e8b5705..5394a062c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ #define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h index fa383c0d0..384050aff 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ #define __ARM_COMPUTE_CLNEGKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h index a512057b9..1d64f9f7d 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h @@ -39,7 +39,7 @@ */ #ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__ #define __ARM_COMPUTE_CLONEHOTKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h new file mode 100644 index 000000000..d4230aaf3 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLPADLAYERKERNELEX_H +#define ARM_COMPUTE_CLPADLAYERKERNELEX_H + +#include "src/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the PadLayer function. */ +class CLPadLayerKernelEx : public ICLKernel +{ +public: + /** Default constructor */ + CLPadLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernelEx(const CLPadLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernelEx &operator=(const CLPadLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CLPadLayerKernelEx(CLPadLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CLPadLayerKernelEx &operator=(CLPadLayerKernelEx &&) = default; + /** Default destructor */ + ~CLPadLayerKernelEx() = default; + /** Set the input and output tensor. + * + * @param[in] input Source tensor. Data types supported: U8, S8, QASYMM8, + * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Set the input and output tensor. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The + * pair padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. 
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPadLayerKernelEx + * + * @param[in] input Source tensor info. Data types supported: U8, S8, QASYMM8, + * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32. + * @param[in] output Output tensor info. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + int _input_start_x; + int _input_start_y; + bool _4d_enabled; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLPADLAYERKERNELEX_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h index 4e1b56cba..3f60db7bb 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ #define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h index 4f9042e41..548f29a27 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ #define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h index 4d4478ece..5f5b7f9b8 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ #define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute 
{ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h index aa4a14812..09073af7c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ #define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" // these parameters can be changed #define _ITEMS 16 // number of items in a group diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h index 8c544cda8..c46b26170 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h @@ -41,15 +41,19 @@ #ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ #define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ -#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/cpu/kernels/CpuElementwiseKernel.h" + namespace arm_compute { -class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel +class NEBinaryLogicalOperationKernel : public cpu::kernels::CpuComparisonKernel { public: + const char *name() const override { return "NEBinaryLogicalOperationKernel"; } + + NEBinaryLogicalOperationKernel() = default; /** Default destructor */ ~NEBinaryLogicalOperationKernel() = default; @@ -81,6 +85,10 @@ protected: // Inherited methods overridden: static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output); + + std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window)> + _function; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h index 101f6ac8e..036d56e69 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h @@ -40,7 +40,7 @@ #ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__ #define __ARM_COMPUTE_NECASTBOOLKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h index 88f21c96e..621500eb8 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ #define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h new file mode 100644 index 000000000..f8f7ac567 --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H +#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H + +#include "src/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; +/** NEON kernel to add a bias to each row of the input tensor */ +class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel +{ +public: + const char *name() const override { return "NEGEMMMatrixAccumulateBiasesKernel"; } + /** Default constructor */ + NEGEMMMatrixAccumulateBiasesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixAccumulateBiasesKernel & + operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Default destructor */ + ~NEGEMMMatrixAccumulateBiasesKernel() = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32 + * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. 
Data type + * supported: Same as @p input + */ + void configure(ITensor *accum, const ITensor *biases); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGEMMMatrixAccumulateBiasesKernel + * + * @param[in] accum The accumulate tensor to convert. Data type supported: F32 + * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type + * supported: Same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *accum, const ITensorInfo *biases); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + ITensor *_accum; + const ITensor *_biases; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h index 5acfde5a8..a03e08ade 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__ #define __ARM_COMPUTE_NEGATHERKERNELEX_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h index cb2a485d5..fb3a72725 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ #define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h index 8724cc69b..1d786b59e 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ #define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h index 198b0be9d..ab534fe96 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ #define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h index 963d7b821..c1c9f7a3c 100644 --- 
a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -39,7 +39,7 @@ */ #ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__ #define __ARM_COMPUTE_NEONEHOTKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h index 0b080cf73..1fd5362ae 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ #define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h index d57e8fcf5..d7ec1b4f0 100644 --- a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -67,5 +67,5 @@ transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_top); -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h index 484ebfd0b..664b8b3b1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -26,6 +26,7 @@ #include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h> #include <arm_compute/runtime/CL/functions/CLNeg.h> #include <arm_compute/runtime/CL/functions/CLOneHot.h> +#include <arm_compute/runtime/CL/functions/CLPadLayerEx.h> #include <arm_compute/runtime/CL/functions/CLReduceOperation.h> #include <arm_compute/runtime/CL/functions/CLSplitVEx.h> #include <arm_compute/runtime/CL/functions/CLTopKV2.h> diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h index b1ee52bf9..05bcc4075 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h @@ -41,8 +41,9 @@ #define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" + #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -100,7 +101,7 @@ private: std::vector<CLTensor> _results_vector; CLTensor _not_reshaped_output; std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector; - CLReshapeLayerKernel _reshape_kernel; + CLReshapeLayer _reshape_kernel; unsigned int _num_of_stages; unsigned int _reduction_axis; }; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h 
b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h index 88a9b00ec..fc4322798 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -43,6 +43,7 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h index d6150684a..854ddce52 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h @@ -67,5 +67,5 @@ public: */ void configure(ICLTensor *input, ICLTensor *output); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLCASTBOOL_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h index fbee7e40e..b0149cb09 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h @@ -73,5 +73,5 @@ public: */ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h index f3266f688..c75ae9a50 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -43,14 +43,14 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" #include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" #include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" -#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" namespace arm_compute { @@ -182,5 +182,5 @@ private: bool _is_prepared; const ICLTensor *_original_weights; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index f27e9913e..c08da526a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -43,16 +43,14 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include 
"arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" namespace arm_compute { @@ -132,9 +130,6 @@ private: * transpose_weights is set to true ) (called once) * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized * asymmetric) - * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref - * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is - * not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. */ @@ -157,40 +152,36 @@ public: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor. The weights must be 2 dimensional. * If this function is called after a Convolution Layer, the (transposed) - * weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) - * weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. + * weights will have as many rows as the product of the first 3 input's dimensions. If it is + * called after another FullyConnected Layer, the (transposed) weights will have as many rows as + * the input's first dimension. Data type supported: Same as @p input. * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix * multiplication between: * - The output of im2col on the input and the (transposed) 2D weights, if the * function is called after a Convolution Layer * - The input tensor and the (transposed) 2D weights, if the function is - * called after another FullyConnected Layer. - * Data type supported: Same as @p input. + * called after another FullyConnected Layer. Data type supported: Same as @p input. * @param[in] fc_info (Optional) Fully connected layer additional info */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLFullyConnectedLayerEx + * CLFullyConnectedLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor info. The weights must be 2 dimensional. * If this function is called after a Convolution Layer, the (transposed) - * weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) - * weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. + * weights will have as many rows as the product of the first 3 input's dimensions. If it is + * called after another FullyConnected Layer, the (transposed) weights will have as many rows as + * the input's first dimension. Data type supported: Same as @p input. * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. 
* @param[out] output Destination tensor info. Its shape should be equal to the output of a * matrix multiplication between: * - The output of im2col on the input and the (transposed) 2D weights, if the * function is called after a Convolution Layer * - The input tensor and the (transposed) 2D weights, if the function is - * called after another FullyConnected Layer. - * Data type supported: Same as @p input. + * called after another FullyConnected Layer. Data type supported: Same as @p input. * @param[in] fc_info (Optional) Fully connected layer additional info * * @return a status diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h index 167554c9e..385eb0b2c 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -47,11 +47,14 @@ #ifndef __ARM_COMPUTE_CLGATHEREX_H__ #define __ARM_COMPUTE_CLGATHEREX_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** * @brief Class to to run @ref CLGatherKernel. @@ -81,5 +84,5 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h index 6618f5aa4..5e172a4c7 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -78,5 +78,5 @@ public: void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, ICLTensor *output, ICLTensor *hits); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h index 887e7aaa5..02ae6d719 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h @@ -41,11 +41,14 @@ #ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ #define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to perform a Instance normalization. 
* diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h index 2bbfca821..62a36f06d 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h @@ -39,9 +39,11 @@ */ #ifndef __ARM_COMPUTE_CLONEHOT_H__ #define __ARM_COMPUTE_CLONEHOT_H__ -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" + #include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/runtime/IFunction.h" + namespace arm_compute { class ICLTensor; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h new file mode 100644 index 000000000..ee1879aaa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLPADLAYEREX_H +#define ARM_COMPUTE_CLPADLAYEREX_H + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" +#include "src/core/gpu/cl/kernels/ClCopyKernel.h" +// #include "arm_compute/runtime/CL/functions/CLCopy.h" +#include <memory> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to pad a tensor. 
This function calls the following OpenCL functions/kernels: + * + * -# @ref CLPadLayerKernelEx if there is padding to be added + * -# @ref CLCopyKernel otherwise + */ +class CLPadLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLPadLayerEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerEx(const CLPadLayerEx &) = delete; + /** Default move constructor */ + CLPadLayerEx(CLPadLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerEx &operator=(const CLPadLayerEx &) = delete; + /** Default move assignment operator */ + CLPadLayerEx &operator=(CLPadLayerEx &&) = default; + + /** Initialize the function + * + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The + * pair padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPadLayerEx. + * + * @param[in] input Source tensor info. Data types supported: All. + * @param[in] output Output tensor info. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). 
+ */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + // Inherited methods overridden: + void run() override; + +private: + void configure_reflect_mode(ICLTensor *input, ICLTensor *output); + + std::unique_ptr<CLPadLayerKernelEx> _pad_kernel; + std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel; + bool _perform_pad; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLPADLAYEREX_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h index bb852e404..45eb72bef 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -116,5 +116,5 @@ private: std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; CLReshapeLayer _reshape; }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h index bb741d98d..3023df3f0 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h @@ -46,6 +46,9 @@ #include <vector> #include <memory> +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/runtime/CPP/functions/CPPSplit.h" + namespace arm_compute { class ICLTensor; @@ -82,5 +85,5 @@ private: unsigned int _num_splits; std::vector<CLSlice> _slice_functions; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLSPLITVEX__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h index e301a5152..f426a4d75 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -160,5 +160,5 @@ private: CLTopKV2Store _store_kernel; #endif }; -} +} // namespace arm_compute #endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h index efc296d6c..d0ddc2609 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -16,7 +16,6 @@ #ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ #define __ARM_COMPUTE_NEFUNCTIONSEX_H__ -#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> #include <arm_compute/runtime/NEON/functions/NECastBool.h> #include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h index 026d30098..8d931f08d 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h @@ -41,8 +41,10 @@ #ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ #define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/core/TypesEx.h" #include 
"arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/core/ITensorInfo.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h index c8b08af8d..dd62645ee 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h @@ -41,16 +41,17 @@ #define __ARM_COMPUTE_NECASTBOOL_H__ #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** - * @brief Class to run @ref NECastBoolKernel. + * @brief Class to run @ref INESimpleFunctionNoBorder. */ -class NECastBool : public INESimpleFunction +class NECastBool : public INESimpleFunctionNoBorder { public: /** Initialize the function's source, destination diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h index 63f7714aa..82a789e86 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -48,12 +48,14 @@ #define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Error.h" #include <vector> namespace arm_compute { class ITensor; +class ITensorInfo; /** * @brief Class to perform EmbeddingLookup operation @@ -84,5 +86,5 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *lookups); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h index 56548a479..214592710 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -44,11 +44,11 @@ #include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h index 8f98f220a..2bbb1fea1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -43,16 +43,16 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" -#include 
"arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" namespace arm_compute { @@ -79,11 +79,11 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; /** Default move constructor */ - NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; /** Default move assignment operator */ - NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = delete; /** Set the input and output tensors. * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. @@ -141,7 +141,7 @@ private: void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); MemoryGroup _memory_group; - NEFlattenLayerKernel _flatten_kernel; + NEFlattenLayer _flatten_kernel; NEConvertFullyConnectedWeights _convert_weights; NEFullyConnectedLayerReshapeWeights _reshape_weights_function; NEGEMM _mm_gemm; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h index 155a1b837..6944c77f6 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h @@ -47,6 +47,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEGatherKernelEx */ class NEGatherEx : public INESimpleFunctionNoBorder diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h index 521a05ad9..f6fda60a9 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -48,12 +48,14 @@ #define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Error.h" #include <vector> namespace arm_compute { class ITensor; +class ITensorInfo; /** * @brief Class to perform HashtableLookup operation @@ -96,5 +98,5 @@ public: const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *hits); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h index 18e813923..0ee967698 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h +++ 
b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -54,6 +54,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to perform a Instance normalization. * @@ -112,5 +113,5 @@ private: Tensor _permuted_input; Tensor _permuted_output; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h index 1a68f801a..668f024a1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -45,6 +45,8 @@ namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; + /** Basic function to run @ref NEOneHotKernel */ class NEOneHot : public INESimpleFunctionNoBorder { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h index 91eec815c..9858e6c09 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -43,7 +43,7 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h index 48b416923..f34a8f8af 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -43,11 +43,13 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +#include "arm_compute/runtime/Tensor.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 7a08dae97..f82579a45 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -102,9 +102,9 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete; /** Allow instances of this class to be moved */ - NETransposeConvLayer(NETransposeConvLayer &&) = default; + NETransposeConvLayer(NETransposeConvLayer &&) = delete; /** Allow instances of this class to be moved */ - NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default; + NETransposeConvLayer &operator=(NETransposeConvLayer &&) = delete; /** Default destructor */ virtual ~NETransposeConvLayer() = default; @@ -171,5 +171,5 @@ private: PadStrideInfo _info; 
bool _is_prepared; }; -} // arm_compute +} // namespace arm_compute #endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 1a8ff3e71..1a180a35b 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -66,12 +66,16 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map {"gather_ex_1d", "gather_ex.cl"}, {"gather_ex_1d_out", "gather_ex.cl"}, {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"gemm_accumulate_biases", "gemm.cl"}, {"hashtable_lookup", "hashtable_lookup.cl"}, {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"memset", "memset.cl"}, {"multiply_scale_factor", "multiply_scale_factor.cl"}, {"neg_tensor", "neg_tensor.cl"}, {"one_hot", "one_hot.cl"}, {"one_hot_only_on_value", "one_hot.cl"}, + {"pad_layer_constant", "pad_layer.cl"}, + {"pad_layer_symmetric_reflect", "pad_layer.cl"}, {"quantization_symm8", "quantization_symm8.cl"}, {"reduce_min_max", "reduce_operation.cl"}, {"reduce_sum_mean", "reduce_operation.cl"}, @@ -90,10 +94,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS { + "activation_float_helpers.h", +#include "./cl_kernels/activation_float_helpers.hembed" + }, + { "arg_min_max_ex.cl", #include "./cl_kernels/arg_min_max_ex.clembed" }, { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { "cast.cl", #include "./cl_kernels/cast.clembed" }, @@ -110,6 +122,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/gemmlowp_ex.clembed" }, { + "gemm_helpers.h", +#include "./cl_kernels/gemm_helpers.hembed" + }, + { "hashtable_lookup.cl", #include "./cl_kernels/hashtable_lookup.clembed" }, @@ -126,8 +142,12 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/instance_normalization_ex.clembed" }, { - "binary_logical_op.cl", -#include "./cl_kernels/binary_logical_op.clembed" + "gemm.cl", +#include "./cl_kernels/gemm.clembed" + }, + { + "memset.cl", +#include "./cl_kernels/memset.clembed" }, { "multiply_scale_factor.cl", @@ -142,6 +162,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/one_hot.clembed" }, { + "pad_layer.cl", +#include "./cl_kernels/pad_layer.clembed" + }, + { "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" }, @@ -150,6 +174,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/reduce_operation.clembed" }, { + "repeat.h", +#include "./cl_kernels/repeat.hembed" + }, + { "scale_factor.cl", #include "./cl_kernels/scale_factor.clembed" }, diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h new file mode 100644 index 000000000..3c3ff8419 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if GPU_ARCH == GPU_ARCH_BIFROST +#define MLA(a, b, c) (fma(c, b, a)) +#else // GPU_ARCH == GPU_ARCH_BIFROST +#define MLA(a, b, c) ((b) * (c) + (a)) +#endif // GPU_ARCH == GPU_ARCH_BIFROST + +// Hard-Swish +#define hard_swish_op(DATA_TYPE, x, A_VAL, B_VAL) \ + (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) + +// Logistic Activation +#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) + +// Hyperbolic Tangent Activation +#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) + +// RELU Tangent Activation +#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) + +// Bounded RELU Activation +#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) + +// Lower Upper Bounded RELU Activation +#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) + +// Leaky RELU Activation +#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) \ + ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) + +// Soft RELU Activation +#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) + +// ELU Activation +#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) \ + (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0))) + +// Absolute Activation +#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x)) + +// Square Activation +#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x) + +// Square-root Activation +#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x)) + +// Linear Activation +#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) + +// Identity Activation +#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x) + +#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL) 
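// Illustrative expansion of the ACT_OP/ACTIVATION dispatch (ACT_OP above, ACTIVATION below),
// assuming the kernel is built with DATA_TYPE=float and the 'relu' activation:
//
//   ACTIVATION(relu, float, x, A_VAL, B_VAL)
//     -> ACT_OP(relu, float, x, A_VAL, B_VAL)   // extra level so 'op' is macro-expanded first
//     -> relu_op(float, x, A_VAL, B_VAL)        // token pasting: op##_op
//     -> (max((float)0.0, x))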
+ +#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl new file mode 100644 index 000000000..9b826a2bd --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl @@ -0,0 +1,7210 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "gemm_helpers.h" +#include "repeat.h" + +#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) +#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1) +#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2) +#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3) +#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7) +#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +#define CONCAT_INC(K0) INC##K0 +#define INC(K0) CONCAT_INC(K0) + +#if (SRC_WIDTH % K0) +#define BOUNDARY_CONDITION_X(x, a) \ + ({ \ + a = select( \ + 0, a, \ + CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), \ + VEC_DATA_TYPE(DATA_TYPE, K0))); \ + }) +#else // (SRC_WIDTH % K0) +#define BOUNDARY_CONDITION_X(x, a) ({}) +#endif // (SRC_WIDTH % K0) + +/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks + * of size M0xK0 and stores each one (not transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. + * -DSRC_WIDTH=16) + * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. + * -DM0=2, -DK0=2). 
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at + * compile time using -DV0 (e.g. -DV0=2) + * @note Only the following values for M0, K0 and V0 are supported: + * M0: 2,3,4,5,6,7,8 + * K0: 2,3,4,8,16 + * V0: greater than 0 + * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer + * 1x1), the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * + * @param[in] src_ptr Pointer to the source LHS tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_INPUT_AS_3D) + */ +__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +) +{ + // Block size +#define BLOCK_SIZE ((M0) * (K0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (K0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (K0) * (V0) +#else // Do not interleave +#define OUTPUT_STEP_X (K0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x 
* (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + + ((y / (uint)V0) * (uint)dst_stride_y) + + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); + + // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src_stride_z by DEPTH_GEMM3D + + input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); + +#else // defined(REINTERPRET_INPUT_AS_3D) + + input_ptr += z * (uint)src_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + output_ptr += z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + // Load values from the LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); + BOUNDARY_CONDITION_X(x, a0); +#if M0 > 1 + BOUNDARY_CONDITION_X(x, a1); +#endif // M0 > 1 +#if M0 > 2 + BOUNDARY_CONDITION_X(x, a2); +#endif // M0 > 2 +#if M0 > 3 + BOUNDARY_CONDITION_X(x, a3); +#endif // M0 > 3 +#if M0 > 4 + BOUNDARY_CONDITION_X(x, a4); +#endif // M0 > 4 +#if M0 > 5 + BOUNDARY_CONDITION_X(x, a5); +#endif // M0 > 5 +#if M0 > 6 + BOUNDARY_CONDITION_X(x, a6); +#endif // M0 > 6 +#if M0 > 7 + BOUNDARY_CONDITION_X(x, a7); +#endif // M0 > 7 + // ---------------------------Store output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} + +#if M0 == 2 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 3 // M0 == 3 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 4 // M0 == 4 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 5 // M0 == 5 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + DATA_TYPE res1 = a4.s##i; \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \ + }) +#elif M0 == 6 // M0 == 6 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = 
(VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + VSTORE(2) \ + (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ + }) +#elif M0 == 7 // M0 == 7 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VEC_DATA_TYPE(DATA_TYPE, 3) \ + res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + VSTORE(3) \ + (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ + }) +#elif M0 == 8 // M0 == 8 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, \ + a6.s##i, a7.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#else // M0 not supported +#error "M0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks + * of size M0xK0 and stores each one (transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. + * -DSRC_WIDTH=16) + * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. + * -DM0=2, -DK0=2). + * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at + * compile time using -DV0 (e.g. -DV0=2) + * @note Only the following values for M0, K0 and V0 are supported: + * M0: 2,3,4,5,6,7,8 + * K0: 2,3,4,8,16 + * V0: greater than 0 + * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer + * 1x1), the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * + * @param[in] src_ptr Pointer to the source LHS tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_INPUT_AS_3D) + */ +__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +) +{ + // Block size +#define BLOCK_SIZE ((M0) * (K0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (M0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (M0) * (V0) +#else // Do not interleave +#define OUTPUT_STEP_X (M0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + + ((y / (uint)V0) * (uint)dst_stride_y) + + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); + + // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply src_stride_z by DEPTH_GEMM3D + + input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); + +#else // defined(REINTERPRET_INPUT_AS_3D) + + input_ptr += z * (uint)src_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + output_ptr += z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + + // Load values from the LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); + BOUNDARY_CONDITION_X(x, a0); +#if M0 > 1 + BOUNDARY_CONDITION_X(x, a1); +#endif // M0 > 1 +#if M0 > 2 + BOUNDARY_CONDITION_X(x, a2); +#endif // M0 > 2 +#if M0 > 3 + BOUNDARY_CONDITION_X(x, a3); +#endif // M0 > 3 +#if M0 > 4 + BOUNDARY_CONDITION_X(x, a4); +#endif // M0 > 4 +#if M0 > 5 + BOUNDARY_CONDITION_X(x, a5); +#endif // M0 > 5 +#if M0 > 6 + BOUNDARY_CONDITION_X(x, a6); +#endif // M0 > 6 +#if M0 > 7 + BOUNDARY_CONDITION_X(x, a7); +#endif // M0 > 7 + // ---------------------------Transpose and store block ----------------------- + + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1); +#if K0 > 2 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2); +#endif // K0 > 2 +#if K0 > 3 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3); +#endif // K0 > 3 +#if K0 > 4 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7); +#endif // K0 > 4 +#if K0 > 8 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F); +#endif // K0 > 8 + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} +#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) + +#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) +/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks + * of size K0xN0 and stores each one (not transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. + * -DSRC_HEIGHT=16) + * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. + * -DK0=2, -DN0=2). + * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at + * compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. 
+ * @note Only the following values for K0, N0 and H0 are supported: + * N0: 2,3,4,8,16 + * K0: 1,2,3,4,8,16 + * H0: greater than 0 + * + * @param[in] src_ptr Pointer to the source RHS tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Block size +#define BLOCK_SIZE ((K0) * (N0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (N0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (N0) * (H0) +#else // Do not interleave +#define OUTPUT_STEP_X (N0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + + z * (uint)src_stride_z; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + + ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, + 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0; + + // Load values from the RHS matrix + a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); +#if K0 > 1 + if (y * (uint)K0 + 1 < SRC_HEIGHT) + { + a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + } +#endif // K0 > 1 +#if K0 > 2 + if (y * (uint)K0 + 2 < SRC_HEIGHT) + { + a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * 
src_stride_y)); + } +#endif // K0 > 2 +#if K0 > 3 + if (y * (uint)K0 + 3 < SRC_HEIGHT) + { + a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); + } +#endif // K0 > 3 +#if K0 > 4 + if (y * (uint)K0 + 4 < SRC_HEIGHT) + { + a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); + } + if (y * (uint)K0 + 5 < SRC_HEIGHT) + { + a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); + } + if (y * (uint)K0 + 6 < SRC_HEIGHT) + { + a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); + } + if (y * (uint)K0 + 7 < SRC_HEIGHT) + { + a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); + } +#endif // K0 > 4 +#if K0 > 8 + if (y * (uint)K0 + 8 < SRC_HEIGHT) + { + a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); + } + if (y * (uint)K0 + 9 < SRC_HEIGHT) + { + a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); + } + if (y * (uint)K0 + 10 < SRC_HEIGHT) + { + aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); + } + if (y * (uint)K0 + 11 < SRC_HEIGHT) + { + aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); + } + if (y * (uint)K0 + 12 < SRC_HEIGHT) + { + aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); + } + if (y * (uint)K0 + 13 < SRC_HEIGHT) + { + aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); + } + if (y * (uint)K0 + 14 < SRC_HEIGHT) + { + aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); + } + if (y * (uint)K0 + 15 < SRC_HEIGHT) + { + aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); + } +#endif // K0 > 8 + + // ---------------------------Store output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} + +#if defined(TRANSPOSE) +/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks + * of size K0xN0 and stores each one (transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. + * -DSRC_HEIGHT=16) + * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. + * -DK0=2, -DN0=2). + * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at + * compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * @note The option -DTRANSPOSE must passed at compile time. + * @note Only the following values for K0, N0 and H0 are supported: + * N0: 2,3,4,8,16 + * K0: 2,3,4,8,16 + * H0: greater than 0 + * + * @param[in] src_ptr Pointer to the source RHS tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Block size +#define BLOCK_SIZE ((K0) * (N0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (K0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (K0) * (H0) +#else // Do not interleave +#define OUTPUT_STEP_X (K0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + + z * (uint)src_stride_z; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + + ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... 
a(K0-1)=0; + + // Load values from the RHS matrix + a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + if (y * (uint)K0 + 1 < SRC_HEIGHT) + { + a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + } +#if K0 > 2 + if (y * (uint)K0 + 2 < SRC_HEIGHT) + { + a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); + } +#endif // K0 > 2 +#if K0 > 3 + if (y * (uint)K0 + 3 < SRC_HEIGHT) + { + a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); + } +#endif // K0 > 3 +#if K0 > 4 + if (y * (uint)K0 + 4 < SRC_HEIGHT) + { + a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); + } + if (y * (uint)K0 + 5 < SRC_HEIGHT) + { + a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); + } + if (y * (uint)K0 + 6 < SRC_HEIGHT) + { + a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); + } + if (y * (uint)K0 + 7 < SRC_HEIGHT) + { + a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); + } +#endif // K0 > 4 +#if K0 > 8 + if (y * (uint)K0 + 8 < SRC_HEIGHT) + { + a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); + } + if (y * (uint)K0 + 9 < SRC_HEIGHT) + { + a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); + } + if (y * (uint)K0 + 10 < SRC_HEIGHT) + { + aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); + } + if (y * (uint)K0 + 11 < SRC_HEIGHT) + { + aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); + } + if (y * (uint)K0 + 12 < SRC_HEIGHT) + { + aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); + } + if (y * (uint)K0 + 13 < SRC_HEIGHT) + { + aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); + } + if (y * (uint)K0 + 14 < SRC_HEIGHT) + { + aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); + } + if (y * (uint)K0 + 15 < SRC_HEIGHT) + { + aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); + } +#endif // K0 > 8 + + // ---------------------------Transpose the block ------------------------------ + REPEAT_VAR_INIT_TO_CONST( + N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, + 0); // VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... 
res(N0-1)=0; + +#if K0 == 2 + // This part computes the following transpositions: + // 2x2 -> 2x2 + // 2x4 -> 4x2 + // 2x8 -> 8x2 + // 2x16 -> 16x2 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF); +#endif // N0 > 8 + +#elif K0 == 3 // K0 == 2 + // This part computes the following transpositions: + // 3x2 -> 2x3 + // 3x4 -> 4x3 + // 3x8 -> 8x3 + // 3x16 -> 16x3 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF); +#endif // N0 > 8 + +#elif K0 == 4 // K0 == 4 + // This part computes the following transpositions: + // 4x2 -> 2x4 + // 4x4 -> 4x4 + // 4x8 -> 8x4 + // 4x16 -> 16x4 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC); 
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF); +#endif // N0 > 8 + +#elif K0 == 8 // K0 == 8 + // This part computes the following transpositions: + // 8x2 -> 2x8 + // 8x4 -> 4x8 + // 8x8 -> 8x8 + // 8x16 -> 16x8 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF); +#endif // N0 > 8 + +#elif K0 == 16 // K0 == 16 + + // This part computes the following transpositions: + // 16x2 -> 2x16 + // 16x4 -> 4x16 + // 16x8 -> 8x16 + // 16x16 -> 16x16 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0, + a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1, + a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2, + a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3, + a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4, + a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5, + a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6, + a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7, + a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 
= (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8, + a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9, + a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA, + a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB, + a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC, + a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD, + a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE, + a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF, + a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF); +#endif // N0 > 8 + +#else // N0 == 16 +#error "Not supported N0 value" +#endif // N0 > 2 + + // ---------------------------Store the output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} +#endif // defined(TRANSPOSE) +#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) + +#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && \ + defined(M) && defined(N) && defined(K) + +#define CONCAT(a, b) a##b + +#define ARM_DOT1(a, b, c) ({ c = fma(a, b, c); }) +#define ARM_DOT2(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT2(a, b, c); \ + c = fma((a.s2), (b.s2), c); \ + }) +#define ARM_DOT4(a, b, c) \ + ({ \ + ARM_DOT3(a, b, c); \ + c = fma((a.s3), (b.s3), c); \ + }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) + +#if N0 == 2 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##4), (c.s4)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##5), (c.s5)); \ + CONCAT(ARM_DOT, k0) \ + ((a), 
(b##6), (c.s6)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##7), (c.s7)); \ + }) +#elif N0 == 16 // N0 == 16 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##4), (c.s4)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##5), (c.s5)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##6), (c.s6)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##7), (c.s7)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##8), (c.s8)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##9), (c.s9)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##A), (c.sA)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##B), (c.sB)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##C), (c.sC)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##D), (c.sD)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##E), (c.sE)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##F), (c.sF)); \ + }) +#else // N0 not supported +#error "N0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. 
Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + 
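+  // Example (hypothetical) build configuration for this kernel, assuming an F32 GEMM with
+  // M=64, N=128, K=256 and an RHS reshaped with H0=4 horizontal blocks per row:
+  //   -DDATA_TYPE=float -DM=64 -DN=128 -DK=256 -DM0=4 -DN0=8 -DK0=4 -DH0=4
+  // With these values each work-item accumulates one M0 x N0 = 4 x 8 block of the destination,
+  // consuming an M0 x K0 tile of the LHS and one transposed K0 x N0 block of the reshaped RHS
+  // per iteration of the inner loop below.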
// Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(K0, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(K0, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(K0, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(K0, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(K0, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(K0, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(K0, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(K0, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(1, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(1, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(1, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(1, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(1, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(1, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(1, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(1, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#define VFMA(a, b, c) ({ c = fma(a, b, c); }) + +#if M0 == 1 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + }) +#elif M0 == 2 // M0 == 2 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + }) +#elif M0 == 3 // M0 == 3 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + }) +#elif M0 == 4 // M0 == 4 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + }) +#elif M0 == 5 // M0 == 5 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + 
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + }) +#elif M0 == 6 // M0 == 6 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + }) +#elif M0 == 7 // M0 == 7 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + }) +#elif M0 == 8 // M0 == 8 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ + }) +#else // M0 not supported +#error "M0 not supported" +#endif // M0 not supported + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90). + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + 
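+  // Example (hypothetical values) of how the addressing macros above behave: with N0=4, K0=4
+  // and H0=2, RHS_BLOCK_SIZE is 16 elements. Without -DRHS_INTERLEAVE, each of the H0 = 2 blocks
+  // sharing an output row of the reshaped RHS starts a full block (RHS_OFFSET_X = 16 elements)
+  // apart and the inner loop advances by RHS_STEP_X = N0 = 4 elements per k step; with
+  // -DRHS_INTERLEAVE the blocks are interleaved, so they start only N0 = 4 elements apart and
+  // the step becomes N0 * H0 = 8 elements.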
REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); // uint zin0=0,zin1=0,zin2=0,... zin7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0,zero1=0,zero2=0,... zero7=0; + +#if defined(REINTERPRET_INPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); + + LD_RHS_VFMA_M0xN0(0, a, c); + LD_RHS_VFMA_M0xN0(1, a, c); +#if K0 > 2 + LD_RHS_VFMA_M0xN0(2, a, c); +#endif // K0 > 2 +#if K0 > 3 + LD_RHS_VFMA_M0xN0(3, a, c); +#endif // K0 > 3 +#if K0 > 4 + LD_RHS_VFMA_M0xN0(4, a, c); + LD_RHS_VFMA_M0xN0(5, a, c); + LD_RHS_VFMA_M0xN0(6, a, c); + LD_RHS_VFMA_M0xN0(7, a, c); +#endif // K0 > 4 +#if K0 > 8 + LD_RHS_VFMA_M0xN0(8, a, c); + LD_RHS_VFMA_M0xN0(9, a, c); + LD_RHS_VFMA_M0xN0(A, a, c); + LD_RHS_VFMA_M0xN0(B, a, c); + LD_RHS_VFMA_M0xN0(C, a, c); + LD_RHS_VFMA_M0xN0(D, a, c); + LD_RHS_VFMA_M0xN0(E, a, c); + LD_RHS_VFMA_M0xN0(F, a, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); +#endif // M0 > 4 +#if M0 > 5 + VEC_DATA_TYPE(DATA_TYPE, 2) + a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); +#endif // M0 > 5 +#if M0 > 6 + VEC_DATA_TYPE(DATA_TYPE, 2) + a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); +#endif // M0 > 6 +#if M0 > 7 + VEC_DATA_TYPE(DATA_TYPE, 2) + a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); +#endif // M0 > 7 + + LD_RHS_VFMA_M0xN0(0, a, c); + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * 
dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && + // defined(M) && defined(N) && defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && \ + defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) + +#if defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + c += a.s8 * b.s8; \ + c += a.s9 * b.s9; \ + c += a.sA * b.sA; \ + c += a.sB * b.sB; \ + c += a.sC * b.sC; \ + c += a.sD * b.sD; \ + c += a.sE * b.sE; \ + c += a.sF * b.sF; \ + }) +#else // K0 not supported +#error "K0 value not supported" 
+#endif // K0 conditions +#else // defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + c = fma(a.s8, b.s8, c); \ + c = fma(a.s9, b.s9, c); \ + c = fma(a.sA, b.sA, c); \ + c = fma(a.sB, b.sB, c); \ + c = fma(a.sC, b.sC, c); \ + c = fma(a.sD, b.sD, c); \ + c = fma(a.sE, b.sE, c); \ + c = fma(a.sF, b.sF, c); \ + }) +#else // K0 not supported +#error "K0 value not supported" +#endif // K0 conditions +#endif // defined(MIXED_PRECISION) + +#if N0 == 2 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + }) +#elif N0 == 16 // N0 == 16 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + ARM_DOT_K0((a), (b##8), (c.s8)); \ + ARM_DOT_K0((a), (b##9), (c.s9)); \ + ARM_DOT_K0((a), (b##A), (c.sA)); \ + ARM_DOT_K0((a), (b##B), (c.sB)); \ + ARM_DOT_K0((a), (b##C), (c.sC)); \ + ARM_DOT_K0((a), (b##D), (c.sD)); \ + ARM_DOT_K0((a), (b##E), (c.sE)); \ + ARM_DOT_K0((a), (b##F), (c.sF)); \ + }) +#else // N0 not supported +#error "N0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT + * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 + * must be transposed + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float) + * @note The data type used for the accumulators must be passed at compile time using + * -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) + * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION + * passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option + * -DLHS_INTERLEAVE must passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS + * reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. 
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) + if ((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (get_global_id(0) / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += get_global_id(2) * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + + for (int i = 0; i < k; i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(a7, b, c7); +#endif // M0 > 7 + + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); + rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, + dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += get_global_id(2) * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // 
defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#if defined(LHS_TRANSPOSE) + +#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) + +#if defined(MIXED_PRECISION) + +#if (GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) \ + c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * \ + (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) \ + c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), \ + (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#else // defined(MIXED_PRECISION + +#if (GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) c += (a) * (b); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#endif // defined(MIXED_PRECISION) + +#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) ({ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); }) +#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ + }) +#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ + }) +#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ + }) +#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ + }) + +// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. 
+// K0 = 1 a is the column-vector (transposed) b is the row-vector (not transposed) C is the output +// matrix Lower case is a vector (a, b) Upper case is a matrix (C) +#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) + +#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ + ({ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); }) +#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ + }) +#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ + }) +#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ + }) +#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ + }) +#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ + }) + +// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. +// The dimensions for this matrix multiplications are defined through M0, N0 and K0 +// The dimensions supported are: +// M0: 1, 2, 3, 4, 8 +// N0: 1, 2, 3, 4, 8, 16 +// K0: 1, 2, 3, 4, 8, 16 +// This macro calls the vector-by-matrix macro K0 times +// A, B and C are matrices +#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ + CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ + (M0, N0, TYPE, A, B, C) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be + * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 + * must be NOT transposed + * + * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. + * -DLHS_TRANSPOSE). + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option + * -DLHS_INTERLEAVE must passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS + * reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (M0) +#define LHS_STEP_X ((M0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (M0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#endif // defined(RHS_INTERLEAVE) + + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide 
matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + + __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); + __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); + + for (int i = 0; i < k; i += K0) + { + VEC_DATA_TYPE(DATA_TYPE, M0) + a0 = VLOAD(M0)(0, lhs); + VEC_DATA_TYPE(DATA_TYPE, N0) + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + +#if K0 > 1 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 1 + +#if K0 > 2 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 2 + +#if K0 > 3 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 3 + +#if K0 > 4 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 4 + +#if K0 > 8 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 8 + +#ifndef LHS_INTERLEAVE + lhs += (M0 * K0 * (V0 - 1)); +#endif // LHS_INTERLEAVE + +#ifndef RHS_INTERLEAVE + rhs += (N0 * K0 * (H0 - 1)); +#endif // RHS_INTERLEAVE + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * 
M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = + bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + + z * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#endif // defined(LHS_TRANSPOSE) + +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && + // defined(DATA_TYPE) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) + +#define VFMA(a, b, c) ({ c = fma(a, b, c); }) + +#if M0 == 1 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); }) +#elif M0 == 2 // M0 == 2 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + }) +#elif M0 == 3 // M0 == 3 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + }) +#elif M0 == 4 // M0 == 4 
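+/* Illustrative sketch: assuming DATA_TYPE=float and N0=4, a call such as
+ * RHS_VFMA_M0xN0(0, a, b0, c) in this M0 == 4 branch expands roughly to one fused
+ * multiply-add per accumulator row,
+ *
+ *   c0 = fma((float4)(a0.s0), b0, c0);
+ *   c1 = fma((float4)(a1.s0), b0, c1);
+ *   c2 = fma((float4)(a2.s0), b0, c2);
+ *   c3 = fma((float4)(a3.s0), b0, c3);
+ *
+ * i.e. element 0 of each LHS row vector is broadcast against the RHS row b0 and
+ * accumulated into the corresponding row of the M0xN0 output block.
+ */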
+#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + }) +#elif M0 == 5 // M0 == 5 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + }) +#elif M0 == 6 // M0 == 6 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + }) +#elif M0 == 7 // M0 == 7 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + }) +#elif M0 == 8 // M0 == 8 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ + }) +#else // M0 not supported +#error "M0 not supported" +#endif // M0 not supported + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., + * -DK0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. 
-DN0=2) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x lhs_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y lhs_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: + * same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes) + * @param[in] rhs_step_x rhs_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes) + * @param[in] rhs_step_y rhs_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero); + + RHS_VFMA_M0xN0(0, a, b0, c); + RHS_VFMA_M0xN0(1, a, b1, c); +#if K0 > 2 + RHS_VFMA_M0xN0(2, a, b2, c); +#endif // K0 > 2 +#if K0 > 3 + RHS_VFMA_M0xN0(3, a, b3, c); +#endif // K0 > 3 +#if K0 > 4 + RHS_VFMA_M0xN0(4, a, b4, c); + RHS_VFMA_M0xN0(5, a, b5, c); + RHS_VFMA_M0xN0(6, a, b6, c); + RHS_VFMA_M0xN0(7, a, b7, c); +#endif // K0 > 4 +#if K0 > 8 + RHS_VFMA_M0xN0(8, a, b8, c); + RHS_VFMA_M0xN0(9, a, b9, c); + RHS_VFMA_M0xN0(A, a, bA, c); + RHS_VFMA_M0xN0(B, a, bB, c); + RHS_VFMA_M0xN0(C, a, bC, c); + RHS_VFMA_M0xN0(D, a, bD, c); + RHS_VFMA_M0xN0(E, a, bE, c); + RHS_VFMA_M0xN0(F, a, bF, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4)); +#endif // M0 > 4 +#if M0 > 5 + VEC_DATA_TYPE(DATA_TYPE, 2) + a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5)); +#endif // M0 > 5 +#if M0 > 6 + VEC_DATA_TYPE(DATA_TYPE, 2) + a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6)); +#endif // M0 > 6 +#if M0 > 7 + VEC_DATA_TYPE(DATA_TYPE, 2) + a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7)); +#endif // M0 > 7 + + VEC_DATA_TYPE(DATA_TYPE, N0) + b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y)); + RHS_VFMA_M0xN0(0, a, b, c); + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, 
DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) + +#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) +/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between + * matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint 
src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global float *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + 
zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x4 block + vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel is optimized for Bifrost and tt computes the matrix multiplication between + * matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + +#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH)) + + int i = 0; + for (; i <= (int)(COLS_MTX_B - 4); i += 4) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, 
src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + for (; i < (int)(COLS_MTX_B); ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, 
c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x4 block + vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +// Undefine local defines +#undef COLS_MTX_B + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and + * matrix B 
reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * 
MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and + * matrix B reshaped (src1) while accumulating the result in a 32 floating point variable. + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint 
src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float8 c0 = 0.0f; + float8 c1 = 0.0f; + float8 c2 = 0.0f; + float8 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT)); + b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / 
(uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias_f, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias_f0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + float8 bias_f1 = convert_float8(bias1); + float8 bias_f2 = convert_float8(bias2); + float8 bias_f3 = convert_float8(bias3); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias_f, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias_f); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + half8 c_h0 = convert_half8(c0); + half8 c_h1 = convert_half8(c1); + half8 c_h2 = convert_half8(c2); + half8 c_h3 = convert_half8(c3); + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication + * between matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. 
a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + +#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH)) + + int i = 0; + for (; i <= (int)(COLS_MTX_B - 4); i += 4) + { +#if MULT_INTERLEAVE4X4_HEIGHT == 1 + // Load values from matrix A (interleaved) and matrix B 
(transposed) + half8 a0 = vload8(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload8(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); +#else // MULT_INTERLEAVE4X4_HEIGHT == 1 + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); +#endif // MULT_INTERLEAVE4X4_HEIGHT == 1 + } + + for (; i < (int)(COLS_MTX_B); ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the 
presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +// Undefine local defines +#undef COLS_MTX_B + +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) + +#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && \ + (NUM_ELEMS_PROCESSED_PER_THREAD_Y) +#if defined(DATA_TYPE) +#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped. + * + * @note This OpenCL kernel works with floating point data types (F16/F32) + * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. 
+ * -DDATA_TYPE=float) + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y + * @note The number of matrix A columns and the optional alpha's value need to be passed at compile + * time using -DCOLS_A and -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16/F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(DATA_TYPE); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // 
Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE)); + + VECTOR_TYPE acc0 = 0.0f; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VECTOR_TYPE acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VECTOR_TYPE acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VECTOR_TYPE acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); + src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, + src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( + 0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0.s0; + acc0 += b1 * (VECTOR_TYPE)a0.s1; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * (VECTOR_TYPE)a1.s0; + acc1 += b1 * (VECTOR_TYPE)a1.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * (VECTOR_TYPE)a2.s0; + acc2 += b1 * (VECTOR_TYPE)a2.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * (VECTOR_TYPE)a3.s0; + acc3 += b1 * (VECTOR_TYPE)a3.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * (VECTOR_TYPE)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * (VECTOR_TYPE)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * (VECTOR_TYPE)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, + zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, + src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, + dst_addr, dst_stride_y, zout.s); +} +#endif // defined(DATA_TYPE) + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma + * units. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to 
the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float4 acc0 = 0.0f; + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float4 acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float4 acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float4 acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s0, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s0, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s0, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s0, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s0, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s0, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s0, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s0, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s0, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s0, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s0, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s0, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s0, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s0, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s1, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s1, b0.s1, acc0.s1); + acc0.s2 = 
fma(a0.s1, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s1, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s1, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s1, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s1, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s1, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s1, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s1, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s1, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s1, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s1, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s1, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s1, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s1, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s2, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s2, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s2, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s2, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s2, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s2, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s2, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s2, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s2, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s2, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s2, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s2, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s2, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s2, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s2, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s2, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s3, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s3, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s3, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s3, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s3, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s3, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s3, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s3, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s3, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s3, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s3, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s3, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s3, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s3, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s3, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s3, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(float); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + 
zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); + acc0.s2 = fma(a0, b0.s2, acc0.s2); + acc0.s3 = fma(a0, b0.s3, acc0.s3); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); + acc1.s2 = fma(a1, b0.s2, acc1.s2); + acc1.s3 = fma(a1, b0.s3, acc1.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); + acc2.s2 = fma(a2, b0.s2, acc2.s2); + acc2.s3 = fma(a2, b0.s3, acc2.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); + acc3.s2 = fma(a3, b0.s2, acc3.s2); + acc3.s3 = fma(a3, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma + * units. This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or + * equal to 1000. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if + * alpha!=1.0f. + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
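For orientation, here is a scalar C sketch of what one work-item of this Bifrost variant computes: two adjacent output columns of one row, with the inner (K) loop that the kernel unrolls by eight. The function and variable names are hypothetical and only illustrate the arithmetic.

#include <math.h>

/* One output row, two adjacent columns, as with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2. */
static void gemm_two_cols_ref(const float *a_row, const float *b, int cols_a,
                              int ldb /* elements per row of B */, int col, float out[2])
{
  float acc0 = 0.0f, acc1 = 0.0f;
  for (int k = 0; k < cols_a; ++k) /* the kernel unrolls this loop by 8 using fma() */
  {
    acc0 = fmaf(a_row[k], b[k * ldb + col + 0], acc0);
    acc1 = fmaf(a_row[k], b[k * ldb + col + 1], acc1);
  }
  out[0] = acc0;
  out[1] = acc1;
}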
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for + // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + 
(uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float2 acc0 = 0.0f; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float2 acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float2 acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float2 acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for (; i <= ((int)COLS_A - 8); i += 8) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0)); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s0 = fma(a0.s1, b1.s0, acc0.s0); + acc0.s0 = fma(a0.s2, b2.s0, acc0.s0); + acc0.s0 = fma(a0.s3, b3.s0, acc0.s0); + acc0.s0 = fma(a0.s4, b4.s0, acc0.s0); + acc0.s0 = fma(a0.s5, b5.s0, acc0.s0); + acc0.s0 = fma(a0.s6, b6.s0, acc0.s0); + acc0.s0 = fma(a0.s7, b7.s0, acc0.s0); + + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s1 = fma(a0.s1, b1.s1, acc0.s1); + acc0.s1 = fma(a0.s2, b2.s1, acc0.s1); + acc0.s1 = fma(a0.s3, b3.s1, acc0.s1); + acc0.s1 = fma(a0.s4, b4.s1, acc0.s1); + acc0.s1 = fma(a0.s5, b5.s1, acc0.s1); + acc0.s1 = fma(a0.s6, b6.s1, acc0.s1); + acc0.s1 = fma(a0.s7, b7.s1, acc0.s1); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * 
src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc1.s0 = fma(a0.s0, b0.s0, acc1.s0); + acc1.s0 = fma(a0.s1, b1.s0, acc1.s0); + acc1.s0 = fma(a0.s2, b2.s0, acc1.s0); + acc1.s0 = fma(a0.s3, b3.s0, acc1.s0); + acc1.s0 = fma(a0.s4, b4.s0, acc1.s0); + acc1.s0 = fma(a0.s5, b5.s0, acc1.s0); + acc1.s0 = fma(a0.s6, b6.s0, acc1.s0); + acc1.s0 = fma(a0.s7, b7.s0, acc1.s0); + + acc1.s1 = fma(a0.s0, b0.s1, acc1.s1); + acc1.s1 = fma(a0.s1, b1.s1, acc1.s1); + acc1.s1 = fma(a0.s2, b2.s1, acc1.s1); + acc1.s1 = fma(a0.s3, b3.s1, acc1.s1); + acc1.s1 = fma(a0.s4, b4.s1, acc1.s1); + acc1.s1 = fma(a0.s5, b5.s1, acc1.s1); + acc1.s1 = fma(a0.s6, b6.s1, acc1.s1); + acc1.s1 = fma(a0.s7, b7.s1, acc1.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc2.s0 = fma(a0.s0, b0.s0, acc2.s0); + acc2.s0 = fma(a0.s1, b1.s0, acc2.s0); + acc2.s0 = fma(a0.s2, b2.s0, acc2.s0); + acc2.s0 = fma(a0.s3, b3.s0, acc2.s0); + acc2.s0 = fma(a0.s4, b4.s0, acc2.s0); + acc2.s0 = fma(a0.s5, b5.s0, acc2.s0); + acc2.s0 = fma(a0.s6, b6.s0, acc2.s0); + acc2.s0 = fma(a0.s7, b7.s0, acc2.s0); + + acc2.s1 = fma(a0.s0, b0.s1, acc2.s1); + acc2.s1 = fma(a0.s1, b1.s1, acc2.s1); + acc2.s1 = fma(a0.s2, b2.s1, acc2.s1); + acc2.s1 = fma(a0.s3, b3.s1, acc2.s1); + acc2.s1 = fma(a0.s4, b4.s1, acc2.s1); + acc2.s1 = fma(a0.s5, b5.s1, acc2.s1); + acc2.s1 = fma(a0.s6, b6.s1, acc2.s1); + acc2.s1 = fma(a0.s7, b7.s1, acc2.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc3.s0 = fma(a0.s0, b0.s0, acc3.s0); + acc3.s0 = fma(a0.s1, b1.s0, acc3.s0); + acc3.s0 = fma(a0.s2, b2.s0, acc3.s0); + acc3.s0 = fma(a0.s3, b3.s0, acc3.s0); + acc3.s0 = fma(a0.s4, b4.s0, acc3.s0); + acc3.s0 = fma(a0.s5, b5.s0, acc3.s0); + acc3.s0 = fma(a0.s6, b6.s0, acc3.s0); + acc3.s0 = fma(a0.s7, b7.s0, acc3.s0); + + acc3.s1 = fma(a0.s0, b0.s1, acc3.s1); + acc3.s1 = fma(a0.s1, b1.s1, acc3.s1); + acc3.s1 = fma(a0.s2, b2.s1, acc3.s1); + acc3.s1 = fma(a0.s3, b3.s1, acc3.s1); + acc3.s1 = fma(a0.s4, b4.s1, acc3.s1); + acc3.s1 = fma(a0.s5, b5.s1, acc3.s1); + acc3.s1 = fma(a0.s6, b6.s1, acc3.s1); + acc3.s1 = fma(a0.s7, b7.s1, acc3.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float) * 8; + } + // float size increment + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + 
zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)); + + LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not beed reshaped + * + * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulating + * the result in a 32 floating point variable. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
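A scalar C sketch of the accumulate-in-f32 idea this kernel's notes describe: the half-precision operands are widened before each fused multiply-add, so rounding back to f16 happens only once, at the final store. The names below are illustrative only.

#include <math.h>

/* a and b hold f16 values already widened to f32 (what convert_float8() does
 * in the kernel); the running sum stays in f32 throughout. */
static float dot_acc32_ref(const float *a, const float *b, int cols_a)
{
  float acc = 0.0f;
  for (int k = 0; k < cols_a; ++k)
    acc = fmaf(a[k], b[k], acc); /* no intermediate f16 rounding */
  return acc;                    /* converted back to f16 only when the tile is stored */
}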
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(half); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add 
offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + float8 acc0 = 0.0h; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float8 acc1 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float8 acc2 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float8 acc3 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (float8)a0.s0, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s0, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s0, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s0, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s1, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s1, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s1, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s1, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s2, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s2, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s2, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s2, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s3, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s3, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s3, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s3, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // 
| | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias_f, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float8 bias_f1 = convert_float8(bias1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float8 bias_f2 = convert_float8(bias2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float8 bias_f3 = convert_float8(bias3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + half8 acc_h0 = convert_half8(acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half8 acc_h1 = convert_half8(acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half8 acc_h2 = convert_half8(acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half8 acc_h3 = convert_half8(acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s); +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not beed reshaped + * + * @note This OpenCL kernel works with the 16-bit floating point data type (half) 
and uses the fma + * units. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(half); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset 
due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + half8 acc0 = 0.0h; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half8 acc1 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half8 acc2 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half8 acc3 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (half8)a0.s0, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s0, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s0, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s0, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s1, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s1, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s1, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s1, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s2, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s2, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s2, acc2); +#endif // 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s2, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s3, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s3, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s3, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s3, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 
NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s); +} +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && + // (NUM_ELEMS_PROCESSED_PER_THREAD_Y) + +#if defined(BETA) +/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account + * that the second matrix might be weighted by a scalar value beta: + * + * @note The beta's value need to be passed at compile time using -DBETA + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: + * F32 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] src_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Load values from A x B + float4 alpha_ab = vload4(0, (__global float *)dst.ptr); + + // Load values from Matrix C + float4 c = vload4(0, (__global float *)src.ptr); + + // Computes alpha * axb + beta * c + float4 out = alpha_ab + (float4)BETA * c; + + // Store final result in axb matrix + vstore4(out, 0, (__global float *)dst.ptr); +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account + * that the second matrix might be weighted by a scalar value beta: + * + * @note The beta's value need to be passed at compile time using -DBETA + * + * @param[in] src_ptr Pointer to the source matrix. 
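The gemm_ma_f32 kernel above reduces to a one-line update per element. A plain C sketch with hypothetical names; beta stands in for the -DBETA build option:

/* dst already holds alpha * (A x B); src holds matrix C. */
static void gemm_ma_ref(float *dst, const float *src, float beta, int n)
{
  for (int i = 0; i < n; ++i)
    dst[i] += beta * src[i]; /* in-place: alpha*AB + beta*C */
}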
Supported data types: + * F16 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] src_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Load values from A x B + half8 alpha_ab = vload8(0, (__global half *)dst.ptr); + + // Load values from Matrix C + half8 c = vload8(0, (__global half *)src.ptr); + + // Computes alpha * axb + beta * c + half8 out = alpha_ab + (half8)BETA * c; + + // Store final result in axb matrix + vstore8(out, 0, (__global half *)dst.ptr); +} +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#endif // defined(BETA) + +#if defined(WIDTH_VECTOR_A) +/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and + * matrix B (src1) used for locally connected layer + * + * @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A + * + * @note The input A and matrix B must not be reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. 
Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0), TENSOR3D_DECLARATION(src1), + IMAGE_DECLARATION(dst)) +{ + int idx = get_global_id(0) * 4; + int idy = get_global_id(1); + + // Compute the address for the vector A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, + src1_offset_first_element_in_bytes + src1_stride_z * idy)); + src_addr.s1 += idx * sizeof(float); + + int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float)); + + float4 acc = 0.0f; + + for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); + src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y)) + { + float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0)); + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + acc += b0 * (float4)a0.s0; + acc += b1 * (float4)a0.s1; + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y)) + { + float a0 = *((__global float *)(src0_ptr + src_addr.s0)); + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + + acc += b0 * (float4)a0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0))); +} +#endif // defined(WIDTH_VECTOR_A) + +/** This kernel accumulates each row with the biases vector. + * + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short. + * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16. + * + * @param[in, out] accum_ptr Pointer to the accumulate tensor. 
Supported + * data type: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] accum_stride_x Stride of the accmulate tensor in X + * dimension (in bytes) + * @param[in] accum_step_x accum_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] accum_stride_y Stride of the accumlulate tensor in Y + * dimension (in bytes) + * @param[in] accum_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the + * accumulate tensor + * @param[in] biases_ptr Pointer to the biases vector. Same as @p + * accum_ptr + * @param[in] biases_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] biases_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +#if defined(DATA_TYPE) && defined(VECTOR_SIZE) +__kernel void gemm_accumulate_biases(IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases)) +{ + Image accum = CONVERT_TO_IMAGE_STRUCT(accum); + Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); + + // Vector size, e.g. number of vector elements. + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr); + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr); + accum_value = biases_value + accum_value; + // Store result in the accumulate buffer + VSTORE(VECTOR_SIZE) + (accum_value, 0, (__global DATA_TYPE *)accum.ptr); +} +#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h new file mode 100644 index 000000000..0c75d061f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h @@ -0,0 +1,1235 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" + +/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). + * @name LOAD_ROW_n + * + * @param[in] N0 The number of rows to load + * @param[in] DATA_TYPE The data type of variables + * @param[in] BASENAME The basename of the destination variables for the loaded rows + * @param[in] PTR The base pointer + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The z-axis offset vector + * @{ + */ +#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); + +#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); + +#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); + +#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); + +#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); + +#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); + +#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); + +#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); + +#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); + +#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); + +#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + 
VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); + +#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); + +#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); + +#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); + +#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); + +#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); + +/** @}*/ // end of group LOAD_ROW_n + +/** Load Blocks (consecutive rows and columns) with Z offset. + * @name LOAD_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 + * The data to load is expected to have consecutive names for each row. + * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2. + * + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive columns + * @param[in] DATA_TYPE The data type of the target + * @param[in] BASENAME The basename of the result variables + * @param[in] PTR The base pointer for the data + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride in y-axis direction + * @param[in] Z The z-axis offset vector + * @{ + */ +#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +/** @} */ // end of group LOAD_BLOCK + +/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
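+ *
+ * A minimal expansion sketch (assuming N0=4, DATA_TYPE=float, BASENAME=a and
+ * hypothetical arguments src_ptr, src_offset and src_stride_y):
+ *   LOAD_ELEMENT_2(4, float, a, src_ptr, src_offset, src_stride_y)
+ *   // roughly expands to
+ *   //   float4 a0 = *((__global float *)(src_ptr + src_offset + 0 * src_stride_y));
+ *   //   float4 a1 = *((__global float *)(src_ptr + src_offset + 1 * src_stride_y));
+ *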
+ * @name LOAD_ELEMENT_n + * + * @param[in] N0 The number of rows to load + * @param[in] DATA_TYPE The data type of variables + * @param[in] BASENAME The basename of the destination variables for the loaded rows + * @param[in] PTR The base pointer + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride value in y-axis direction + * @{ + */ +#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y)); + +#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y)); + +#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y)); + +#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); + +#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); + +#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); + +#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); + +#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); + +#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); + +#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); + +#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); + +#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); + +#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); + +#define 
LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); + +#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); + +#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); + +/** @}*/ // end of group LOAD_ELEMENT_n + +/** Load Scalar as Vector (consecutive elements). + * @name LOAD_SCALAR_AS_VECTOR + * + * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 + * The data to load is expected to have consecutive names for each row. + * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. + * + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive columns + * @param[in] DATA_TYPE The data type of the target + * @param[in] BASENAME The basename of the result variables + * @param[in] PTR The base pointer for the data + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride in y-axis direction + * @{ + */ +#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +/** @} */ // end of group LOAD_SCALAR_AS_VECTOR + +/** Basic macros to calculate Z offset values from Z0 to Zn-1 + * @name CALCULATE_Z_OFFSET_n + * + * @param[in] M0 The number of offset values to calculate + * @param[in] DATA_TYPE The data type of the results + * @param[in] Z The basename of the result variables + * @param[in] Y The work-itme ID of y-axis + * @param[in] HEIGHT_GEMM3D The height of GEMM3D + * @param[in] DEPTH_GEMM3D The depth of GEMM3D + * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension + * @param[in] STRIDE_Y The stride value in y-axis direction + * + * @{ + */ +#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ + Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ + Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ + Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, 
CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ + Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ + Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ + Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ + Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ + Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); + +/** @} */ // end of group CALCULATE_Z_OFFSET_n + +/** Calculate Z offset values from Z0 to Zn-1 + * @name CALCULATE_Z_OFFSET + * + * The Z offsets are expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin1, zin2, zin3. + * Note that, CROSS_PLANE_PAD (cross plain padding) is required to take into account + * the possible cross plane paddings in case of the plance changes across the z-dimension. 
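+ *
+ * A usage sketch (assuming M0=2, DATA_TYPE=uint, Z=zin, Y taken from the y work-item id,
+ * and hypothetical HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD and src_stride_y values
+ * provided by the caller, typically as compile-time defines):
+ *   CALCULATE_Z_OFFSET(2, uint, zin, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D,
+ *                      CROSS_PLANE_PAD, src_stride_y);
+ *   // zin0 and zin1 (declared beforehand by the caller) now hold the extra byte
+ *   // offsets to add when addressing rows 0 and 1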
+ * + * <!-- + * | | + * | plane0 | + * | | + * |__________________| + * |******************| + * | cross_plane_pad | + * |******************| + * | | + * | plane1 | + * | | + * |__________________| + * --> + * + * @param[in] M0 The number of offset values to calculate + * @param[in] DATA_TYPE The data type of the results + * @param[in] Z The basename of the result variables + * @param[in] Y The work-itme ID of y-axis + * @param[in] HEIGHT_GEMM3D The height of GEMM3D + * @param[in] DEPTH_GEMM3D The depth of GEMM3D + * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension + * @param[in] STRIDE_Y The stride value in y-axis direction + * @{ + */ +#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) +#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) +/** @} */ // end of group CALCULATE_Z_OFFSET + +/** Store the 0 to (n-1)th rows of the given variables + * @name STORE_ROW_n + * + * @param[in] N0 The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); + +#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); + +#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); + +#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); + +#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); + +#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); + +#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); + +#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); + +#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); + +#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + 
VSTORE(N0) \ + (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); + +#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); + +#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); + +#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); + +#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); + +#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); + +#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); +/** @} */ // end of groupd STORE_ROW_n + +/** Convert and store the 0th to (n-1)th rows of the given variables + * @name CONVERT_STORE_ROW_n + * + * @param[in] N0 The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); + +#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); + +#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); + +#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); + +#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); + +#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); + +#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, 
PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); + +#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); + +#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); + +#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); + +#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); + +#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); + +#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); + +#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); + +#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); + +#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); + +/** @} */ // end of groupd CONVERT_STORE_ROW_n + +/** Store a block of the given size M0xN0 + * @name STORE_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 
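+ *
+ * A minimal expansion sketch (assuming M0=2, N0=4, DATA_TYPE=float, BASENAME=c and
+ * hypothetical arguments dst_addr, dst_stride_y and Z offsets zout0/zout1):
+ *   STORE_BLOCK(2, 4, float, c, dst_addr, dst_stride_y, zout)
+ *   // roughly expands to
+ *   //   vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout0));
+ *   //   vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout1));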
+ * + * @param[in] M0 The number of rows to store + * @param[in] N0 The size of each vector + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +/** @} */ // end of group STORE_BLOCK + +/** Convert and store a block of the given size M0xN0 + * @name CONVERT_STORE_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. + * + * @param[in] M0 The number of rows to store + * @param[in] N0 The size of each vector + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +/** @} */ // end of group CONVERT_STORE_BLOCK + +/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) + * @name SCALE_ROW_n + * + * @param[in] DATA_TYPE The data type of the variables + * @param[in] BASENAME The basename of the variables + * @param[in] SCALE The scale factor + * @{ + */ +#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##1 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##2 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##3 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##4 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##5 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##6 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##7 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##8 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##9 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##A *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_11(DATA_TYPE, 
BASENAME, SCALE) \ + BASENAME##B *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##C *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##D *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##E *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##F *= (DATA_TYPE)SCALE; +/** @} */ // end of group SCALE_ROW_n + +/** Scale elements stored in a block (BASENAME) + * @name SCALE_BLOCK + * + * Supported cases are N=1,2,3,...,16 + * + * @param[in] N The number of rows in the block + * @param[in] DATA_TYPE The data type of the block + * @param[in] BASENAME The basename of the block + * @param[in] SCALE The scale factor + * @{ + */ +#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) +#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) +/** @} */ // end of group SCALE_BLOCK + +/** Create a new vector containing the values at the given index for a set of given vectors + * @name COLUMN_VECTORn + * + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] X The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + * @{ + */ +#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ + TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); +#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 2) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); +#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 3) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); +#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 4) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \ + (X##2).s##IDX_COL, (X##3).s##IDX_COL); +#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \ + (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \ + (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); +#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \ + (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \ + (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, \ + (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, \ + (X##F).s##IDX_COL); +/** @} */ // end of group COLUMN_VECTORn + +/** Create a new vector containing the values at the given index. 
Utility macros for transposing a + * colum-vector + * @name COLUMN_VECTOR_SCALARn + * + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] X The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + * @{ + */ +#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0)); +#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 2) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); +#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 3) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); +#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 4) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); +#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); +#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \ + (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); +/** @} */ // end of group COLUMN_VECTORn + +/** Create transposed vectors of the given vectors + * @name TRANSPOSE_K0Xn + * + * @param[in] K0 The size of the source vectors + * @param[in] BASENAME The basename of transposed vectors + * @param[in] B The basename of source vectors for transposition + * @param[in] TYPE The data type of the transposed vectors + * @{ + */ +#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE); +#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \ + COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE); +#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X2(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE); +#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X3(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE); +#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X4(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE); +#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X8(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, A, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, B, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, C, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, D, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, E, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, F, BASENAME, B, TYPE); + +/** @} */ // end of group TRANSPOSE_K0Xn + +/** Create column vectors to contain the values at the given index for a set of given vectors + * + * @param[in] K0 The number of source vectors + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] B The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + */ +#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \ + CONCAT(COLUMN_VECTOR, K0) \ + (IDX_COL, BASENAME, B, TYPE); + +/** Create column vectors to contain the values at 
the given index. Utility macro for transposing a + * column-vector + * + * @param[in] K0 The number of source vectors + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] B The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + */ +#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \ + CONCAT(COLUMN_VECTOR_SCALAR, K0) \ + (IDX_COL, BASENAME, B, TYPE); + +/** Create transposed vectors form the given source vectors + * + * @param[in] K0 The size of source vectors + * @param[in] N0 The number of source vectors + * @param[in] BASENAME The basename of transposed vectors + * @param[in] B The basename of source vectors for transposition + * @param[in] TYPE The data type of the transposed vectors + * + */ +#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \ + CONCAT(TRANSPOSE_K0X, N0) \ + (K0, BASENAME, B, TYPE); + +/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1) + * @name ADD_ROW_n + * + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The basename of the added variables + * @{ + */ +#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0; + +#define ADD_ROW_2(BASENAME, BIAS) \ + ADD_ROW_1(BASENAME, BIAS) \ + BASENAME##1 += BIAS##1; + +#define ADD_ROW_3(BASENAME, BIAS) \ + ADD_ROW_2(BASENAME, BIAS) \ + BASENAME##2 += BIAS##2; + +#define ADD_ROW_4(BASENAME, BIAS) \ + ADD_ROW_3(BASENAME, BIAS) \ + BASENAME##3 += BIAS##3; + +#define ADD_ROW_5(BASENAME, BIAS) \ + ADD_ROW_4(BASENAME, BIAS) \ + BASENAME##4 += BIAS##4; + +#define ADD_ROW_6(BASENAME, BIAS) \ + ADD_ROW_5(BASENAME, BIAS) \ + BASENAME##5 += BIAS##5; + +#define ADD_ROW_7(BASENAME, BIAS) \ + ADD_ROW_6(BASENAME, BIAS) \ + BASENAME##6 += BIAS##6; + +#define ADD_ROW_8(BASENAME, BIAS) \ + ADD_ROW_7(BASENAME, BIAS) \ + BASENAME##7 += BIAS##7; + +#define ADD_ROW_9(BASENAME, BIAS) \ + ADD_ROW_8(BASENAME, BIAS) \ + BASENAME##8 += BIAS##8; + +#define ADD_ROW_10(BASENAME, BIAS) \ + ADD_ROW_9(BASENAME, BIAS) \ + BASENAME##9 += BIAS##9; + +#define ADD_ROW_11(BASENAME, BIAS) \ + ADD_ROW_10(BASENAME, BIAS) \ + BASENAME##A += BIAS##A; + +#define ADD_ROW_12(BASENAME, BIAS) \ + ADD_ROW_11(BASENAME, BIAS) \ + BASENAME##B += BIAS##B; + +#define ADD_ROW_13(BASENAME, BIAS) \ + ADD_ROW_12(BASENAME, BIAS) \ + BASENAME##C += BIAS##C; + +#define ADD_ROW_14(BASENAME, BIAS) \ + ADD_ROW_13(BASENAME, BIAS) \ + BASENAME##D += BIAS##D; + +#define ADD_ROW_15(BASENAME, BIAS) \ + ADD_ROW_14(BASENAME, BIAS) \ + BASENAME##E += BIAS##E; + +#define ADD_ROW_16(BASENAME, BIAS) \ + ADD_ROW_15(BASENAME, BIAS) \ + BASENAME##F += BIAS##F; + +/** @} */ // end of group ADD_ROW_n + +/** Add the block (BIAS) to another block (BASENAME) + * @name ADD_BLOCK + * + * Supported cases are N=1,2,3,...,16 + * + * @param[in] N The number of vectors in the block + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The basename of the added variables + * @{ + */ +#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) +#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) +/** @} */ // end of group ADD_BLOCK + +/** Broadcast (add single value) to the each element of the destination variables + * @name ADD_ROW_BROADCAST_n + * + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The variable containing the value to add + * @{ + */ +#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS; + +#define 
ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ + BASENAME##1 += BIAS; + +#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ + BASENAME##2 += BIAS; + +#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ + BASENAME##3 += BIAS; + +#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ + BASENAME##4 += BIAS; + +#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ + BASENAME##5 += BIAS; + +#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ + BASENAME##6 += BIAS; + +#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ + BASENAME##7 += BIAS; + +#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ + BASENAME##8 += BIAS; + +#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ + BASENAME##9 += BIAS; + +#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ + BASENAME##A += BIAS; + +#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ + BASENAME##B += BIAS; + +#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ + BASENAME##C += BIAS; + +#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ + BASENAME##D += BIAS; + +#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ + BASENAME##E += BIAS; + +#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ + BASENAME##F += BIAS; + +/** Broadcast (add a value) to the each element of the destination block (BASENAME) + * @name ADD_BLOCK_BROADCAST + * + * Supported cases are N=1,2,3,...,16. 
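+ *
+ * A usage sketch (assuming N=2, accumulator rows c0 and c1 of type float4, and a bias
+ * value bias0 already loaded by the caller):
+ *   ADD_BLOCK_BROADCAST(2, c, bias0); // expands to: c0 += bias0; c1 += bias0;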
+ * + * @param[in] N The number of vectors in the block + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The variable containing the value to add + * @{ + */ +#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) +#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) +/** @} */ // end of group ADD_BLOCK_BROADCAST + +/** Apply activation to the given variables + * @name ACTIVATION_ROW_n + * + * @param[in] ACTIVATION_TYPE The type of the activation + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] A_VAL Additional value required by the activation + * @param[in] B_VAL Additional value required by the activation + * @{ + */ +#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL); + +#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL); + +#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL); + +#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL); + +#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL); + +#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL); + +#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL); + +#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL); + +#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL); + +#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL); + +#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL); + +#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##B = 
ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL); + +#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL); + +#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL); + +#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL); + +#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL); +/** @} */ // end of group ACTIVATION_ROW_n + +/** Apply activation to a block (BASENAME) + * @name ACTIVATION_BLOCK + * + * Supported cases are N=1,2,3,...,16. + * + * @param[in] N The number of vectors in the block + * @param[in] ACTIVATION_TYPE The type of the activation + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] A_VAL Additional value required by the activation + * @param[in] B_VAL Additional value required by the activation + * @{ + */ +#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) +/** @} */ // end of group ACTIVATION_BLOCK + +/** Apply convert_<data_type> to the given variables + * @name CONVERT_ROW_n + * + * @param[in] N The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME_SRC The basename of the source variables + * @param[in] BASENAME_DST The basename of the destination variables + */ +#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##5 = 
CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); +/** @} */ // end of group CONVERT_ROW_n + +/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST) + * @name CONVERT_BLOCK + * + * Supported cases N=1,2,3,...,16. 
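+ *
+ * A minimal expansion sketch (assuming M=2, N=4, source rows acc0 and acc1, and
+ * hypothetical destination rows out0 and out1 of type float4):
+ *   CONVERT_BLOCK(2, 4, float, acc, out)
+ *   // roughly expands to
+ *   //   float4 out0 = convert_float4(acc0);
+ *   //   float4 out1 = convert_float4(acc1);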
+ * + * @param[in] M The number of vectors to convert + * @param[in] N The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME_SRC The basename of the source variables + * @param[in] BASENAME_DST The basename of the destination variables + */ +#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +/** @} */ // end of group CONVERT_BLOCK diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl new file mode 100644 index 000000000..2d9acc753 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl @@ -0,0 +1,2733 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "gemm_helpers.h" +#include "helpers_asymm.h" +#include "repeat.h" + +#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. + * These macros use the dot8 instruction */ +#define ARM_DOT1(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ + }) +#define ARM_DOT2(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ + }) +#define ARM_DOT4(a, b, c) ({ ARM_DOT(a, b, c); }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) + +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] + * without using the dot8 instruction. 
*/ +#define ARM_DOT1(a, b, c) ({ c += (ACC_DATA_TYPE)a * b; }) +#define ARM_DOT2(a, b, c) \ + ({ \ + c += (ACC_DATA_TYPE)a.s0 * b.s0; \ + c += (ACC_DATA_TYPE)a.s1 * b.s1; \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT2(a, b, c); \ + c += (ACC_DATA_TYPE)a.s2 * b.s2; \ + }) +#define ARM_DOT4(a, b, c) \ + ({ \ + ARM_DOT3(a, b, c); \ + c += (ACC_DATA_TYPE)a.s3 * b.s3; \ + }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 + * vectors "b" of size K0 [1,16] */ +#define ARM_DOT_K0X1(k0, a, b, c) ({ ARM_DOT_K0(k0, (a), (b##0), (c)); }) +#define ARM_DOT_K0X2(k0, a, b, c) \ + ({ \ + ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \ + ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \ + }) +#define ARM_DOT_K0X3(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X2(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \ + }) +#define ARM_DOT_K0X4(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X3(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \ + }) +#define ARM_DOT_K0X8(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X4(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \ + ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \ + ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \ + ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \ + }) +#define ARM_DOT_K0X16(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X8(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \ + ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \ + ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \ + ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \ + ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \ + ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \ + ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \ + ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \ + }) + +/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */ +#define ARM_MM_K0XN0X1(n0, k0, a, b, c) ({ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); }) +#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X1(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \ + }) +#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X2(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \ + }) +#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X3(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \ + }) +#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X4(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \ + }) +#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X5(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \ + }) +#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X6(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \ + }) +#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X7(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \ + }) + +#define ARM_DOT_K0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b), (c)); \ + }) + +#define ARM_DOT_K0XN0(n0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT_K0X, n0) \ + (k0, (a), b, (c)); \ + }) + +#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MM_K0XN0X, m0) \ + (n0, k0, a, b, c); \ + }) + +/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 + * vectors "b" of size K0 [1,16] */ +#define 
ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) ({ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; }) +#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \ + c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \ + }) +#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \ + }) +#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \ + }) +#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \ + c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \ + c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \ + c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \ + }) +#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \ + c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \ + c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \ + c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \ + c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \ + c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \ + c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \ + c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \ + }) +/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */ +#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); }) +#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \ + }) +#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \ + }) +#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \ + }) +#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \ + }) +#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \ + }) +#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \ + }) +#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \ + }) +#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MUL_N0X, k0) \ + (VECTOR_ACC_TYPE, (a), b, (c)); \ + }) +#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \ + (VECTOR_ACC_TYPE, k0, a, b, c); \ + }) + +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && \ + defined(N) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices with + * QASYMM/QASYMM_SIGNED data type. 
The LHS matrix must be reshaped with @ref + * CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed. The RHS matrix must be reshaped + * with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (i.e. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option + * -DLHS_INTERLEAVE must be passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must be passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix.
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + + (z * lhs_stride_z); + + // Compute RHS matrix address + __global DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint 
zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + for (int i = 0; i < k; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + // Update address + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP); + rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is + * transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. 
+ * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in + * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + 
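+ // Editorial illustration (comment only, not part of the upstream kernel): the preprocessor block below selects the RHS addressing constants. + // Assuming the example build options quoted in the kernel comment above (-DN0=8, -DK0=4, -DH0=2), RHS_BLOCK_SIZE evaluates to 32 elements. + // With -DRHS_INTERLEAVE defined, the H0 K0xN0 blocks stored on the same output row of the reshaped RHS matrix are interleaved, so RHS_OFFSET_X = K0 = 4 and RHS_STEP_X = K0 * H0 = 8; + // without it the blocks are stored back to back, so RHS_OFFSET_X = RHS_BLOCK_SIZE = 32 and RHS_STEP_X = K0 = 4.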
// RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + for (int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + lhs_offset += K0; + rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage + * using fixed-point arithmetic. The LHS matrix is NOT reshaped. The RHS matrix is reshaped with @ref + * CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must be passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be + * passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed + * at compile time. + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix.
+ * Supported data type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in + * X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in + * Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in + * the LHS reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. + * Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in + * X dimension (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in + * Y dimension (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in + * the RHS reshaped matrix + * @param[out] dst_ptr Pointer to the destination matrix + * Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in + * X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in + * Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in + * Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in + * Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS + * matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the + * output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. Supported data type: S32 + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. 
Supported data type: S32 + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. Supported data type: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint( + IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // 
Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + for (int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + lhs_offset += K0; + rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + // Result of MM is of type DATA_TYPE + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(DATA_TYPE) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert result of matrix multiplication to S32 + REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int); + + int batch_id = z; +#if defined(DEPTH_GEMM3D) + batch_id /= (int)DEPTH_GEMM3D; +#endif // defined(DEPTH_GEMM3D) + + // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET; + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET); + +#if defined(A_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_col_addr = + sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + +#if defined(SUM_COL_HAS_BATCHES) + sum_col_addr += z * sum_col_stride_y; +#endif // defined(SUM_COL_HAS_BATCHES) + VEC_DATA_TYPE(int, N0) + a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr); + a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET; + + REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32); +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to B_OFFSET + __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + + (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y; + +#if defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D) + sum_row_addr += (batch_id % (int)DEPTH_GEMM3D) * (int)HEIGHT_GEMM3D * sizeof(int); +#endif // defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D) + LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x); + + REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET); +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = + biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + + VEC_DATA_TYPE(int, N0) + bias_values = VLOAD(N0)(0, (__global int *)bias_addr); + REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values); +#endif // defined(ADD_BIAS) + + REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_); + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = result_multipliers_ptr + + result_multipliers_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + + VEC_DATA_TYPE(int, N0) + res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr); + VEC_DATA_TYPE(int, N0) + res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr); + + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, + RESULT_SHIFT); +#else // RESULT_SHIFT >= 0 + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, + RESULT_SHIFT); +#endif // RESULT_SHIFT < 0 + +#endif // defined(PER_CHANNEL_QUANTIZATION) + + // Add the offset terms to GEMM's result + REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET); + +#if defined(MIN_BOUND) + REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND); +#endif // defined(MIN_BOUND) 
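+ // Editorial illustration (comment only, not part of the upstream kernel): MIN_BOUND and MAX_BOUND are the optional clamping bounds described in the kernel comment above. + // For example, assuming an unsigned 8-bit output, building with -DMIN_BOUND=0 -DMAX_BOUND=255 clamps the requantized int32 values to the uchar range before the saturating convert-and-store below, which is how a fused "rectified linear unit" style activation is obtained.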
+#if defined(MAX_BOUND) + REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Convert and store output block (does convert saturate) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c_int, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && + // defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., + * -DK0=2) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in + * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z, + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + + for (; i <= (K - K0); i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,K0 +#if (GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += K0; + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over for loop + for (; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,1 +#if (GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += 1; + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) + +#if defined(COLS_A) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix + * A. It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at + * compile time. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. 
+ * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0; + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = + (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 16 accumulations + for (; i <= ((int)COLS_A - 16); i += 16) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i); + + sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)); + } + + // This for loop performs the leftover accumulations + for (; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + + sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3; + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A + * using the arm dot product instruction. It is also possible to multiply each reduced row by a + * scalar value, if SCALAR is passed at compile time. 
+ * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = + (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 32 accumulations + for (; i <= ((int)COLS_A - 32); i += 32) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + a0 = vload16(0, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + + a0 = vload16(1, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + } + + // This for loop performs the leftover accumulations + for (; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#endif // defined(COLS_A) + +#if defined(COLS_B) 
&& defined(ROWS_B) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of + * Matrix B. It is also possible to multiply each reduced column by a scalar value, if SCALAR is + * passed at compile time. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix B columns and rows needs to be passed at compile time using + * -DCOLS_B and -DROWS_B + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 16) + sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0; + + __global const DATA_TYPE *matrix_b = + (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z); + + int i = 0; + // This for loop performs 4 accumulations + for (; i <= ((int)ROWS_B - 4); i += 4) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b + 0 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b1 = vload16(0, matrix_b + 1 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b2 = vload16(0, matrix_b + 2 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b3 = vload16(0, matrix_b + 3 * src_stride_y); + + sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + + matrix_b += 4 * src_stride_y; + } + + // This for loop perfoms the leftover accumulations + for (; i < (int)ROWS_B; ++i) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b); + + sum_col_32 += 
CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + + matrix_b += src_stride_y; + } + +#if defined(SCALAR) + sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR; +#endif // defined(SCALAR) + VSTORE(16) + (convert_int16(sum_col_32), 0, (__global int *)dst.ptr); +} +#endif // defined(COLS_B) && defined(ROWS_B) + +#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(K_OFFSET) + +/* Helper function used to calculate the offset contribution after matrix multiplication. + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), + * and calculates the offset contribution of matrix A and matrix B. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * @param[in] x get_global_id(0) * 4 + * @param[in] y get_global_id(1) + * @param[in] z get_global_id(2) + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + */ +inline int4 offset_contribution(int x, int y, int z +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +) +{ + int4 a_offset_s32 = (int4)0; + int4 b_offset_s32 = (int4)0; + + int batch_id = z; +#if defined(DEPTH_INPUT3D) + batch_id /= (int)DEPTH_INPUT3D; +#endif // defined(DEPTH_INPUT3D) + +#if defined(A_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_col_addr = + sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int); + + // Compute the offset contribution due to A_OFFSET +#if defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); +#else // defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = vload4(0, (__global int *)sum_col_addr); +#endif // defined(SUM_COL_HAS_BATCHES) + + a_offset_s32 *= (int4)A_OFFSET; +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_row_addr = + sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int); + + // Compute the offset contribution due to B_OFFSET +#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); +#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); +#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 *= (int4)B_OFFSET; +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + b_offset_s32 += (int4)biases_values; +#endif // defined(ADD_BIAS) + + return (int4)K_OFFSET + a_offset_s32 + b_offset_s32; +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is + * performed in-place + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. 
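The two reduction kernels above and the offset_contribution() helper together implement the usual gemmlowp offset decomposition: the raw int32 product of the quantized matrices is corrected with the per-row sums of A (sum_row), the per-column sums of B (sum_col) and the constant K_OFFSET = A_OFFSET * B_OFFSET * K. As a rough scalar sketch (hypothetical names; the sign convention carried by A_OFFSET/B_OFFSET is assumed to match what the host code passes):

#include <stdint.h>

/* Illustrative reference only: apply the correction
 *   mm_result[i][j] += sum_col[j] * A_OFFSET + sum_row[i] * B_OFFSET + K_OFFSET
 * to an M x N block of raw accumulators, optionally adding a per-column bias. */
static void offset_contribution_ref(int32_t *mm_result, const int32_t *sum_row,
                                    const int32_t *sum_col, const int32_t *bias,
                                    int M, int N, int32_t a_offset, int32_t b_offset,
                                    int32_t k_offset)
{
  for (int i = 0; i < M; ++i)
  {
    for (int j = 0; j < N; ++j)
    {
      int32_t acc = mm_result[i * N + j];
      acc += sum_col[j] * a_offset; /* contribution of A's quantization offset */
      acc += sum_row[i] * b_offset; /* contribution of B's quantization offset */
      acc += k_offset;              /* A_OFFSET * B_OFFSET * K */
      if (bias)
        acc += bias[j];
      mm_result[i * N + j] = acc;
    }
  }
}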
Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * @param[in] mm_result_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + */ +__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS)) +) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // Store the result with the offset contribution + vstore4(in_s32, 0, (__global int *)mm_result_addr); +} + +#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && \ + defined(OUTPUT_DATA_TYPE) +/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and + * it quantizes down to uint8. + * + * This kernel takes a final int32 accumulator value (the output of + * @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and + * quantizes to uint8 through the output stage. + * + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage. 
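For reference, the plain (non fixed-point) output stage listed just below, offset, integer multiply, shift, clamp and saturating cast, can be sketched in scalar C. This is an illustrative sketch only, written for QASYMM8 output and a non-negative RESULT_SHIFT; the kernel itself also handles a negative shift and the signed output type:

#include <stdint.h>

/* Illustrative scalar version of the "quantize down" output stage applied to
 * one accumulator that already contains the offset/bias contribution. */
static uint8_t quantize_down_ref(int32_t acc, int32_t result_offset,
                                 int32_t result_mult_int, int32_t result_shift,
                                 int32_t min_bound, int32_t max_bound)
{
  acc += result_offset;
  acc *= result_mult_int;
  acc >>= result_shift;             /* arithmetic shift right */
  if (acc < min_bound) acc = min_bound;  /* optional -DMIN_BOUND / -DMAX_BOUND clamp */
  if (acc > max_bound) acc = max_bound;
  if (acc < 0)   acc = 0;           /* saturating cast to the QASYMM8 range */
  if (acc > 255) acc = 255;
  return (uint8_t)acc;
}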
The output stage computes the + * following operations: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND + * are passed at compile time) + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] mm_result_ptr Pointer to the source tensor. + * Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of + * elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in + * the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. 
Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor + * Supported data type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in + * X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + , +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // -------------- OUTPUT STAGE + + // Add the offset terms to GEMM's result + in_s32 += (int4)RESULT_OFFSET; + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = + result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); + int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); + int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + + in_s32 *= result_multipliers_values; + in_s32 >>= result_shifts_values; +#else // defined(PER_CHANNEL_QUANTIZATION) + in_s32 *= RESULT_MULTIPLIER; + + in_s32 >>= RESULT_SHIFT; +#endif // defined(PER_CHANNEL_QUANTIZATION) + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes + * down to uint8. 
+ * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to + * it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output + * stage. + * + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage. The output stage computes the + * following operations: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] mm_result_ptr Pointer to the source tensor. + * Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of + * elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in + * the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. 
Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor + * Supported data type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in + * X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void + gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + , +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) + ) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // -------------- OUTPUT STAGE + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = + result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); + int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); + int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + + int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + in_s32, result_multipliers_values, result_shifts_values, 4); + int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + in_s32, result_multipliers_values, result_shifts_values, 4); + in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + in_s32 = + ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + +#endif // defined(PER_CHANNEL_QUANTIZATION) + + // Add the offset terms to GEMM's result + in_s32 += (int4)RESULT_OFFSET; + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 
4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && + // defined(OUTPUT_DATA_TYPE) + +#endif // defined(K_OFFSET) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value and processes it to obtain the final + * QASYMM8/QASYMM8_SIGNED value. The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND + * are passed at compile time) + * -# Clamp the resulting int32 values: + * -# - to the [0..255] range and cast to QASYMM8. + * -# - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Add the offset terms to GEMM's result + input_values += (int4)RESULT_OFFSET; + + // Multiply by result_mult_int and shift + input_values *= RESULT_MULT_INT; + +#if RESULT_SHIFT < 0 + input_values >>= -RESULT_SHIFT; +#else // RESULT_SHIFT >= 0 + input_values >>= RESULT_SHIFT; +#endif // RESULT_SHIFT < 0 + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) + +#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && \ + defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. 
The following computations will be + * performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER + * and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + + // Add the offset terms to GEMM's result + input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && + // defined(RESULT_SHIFT) + +#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) + +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QSYMM16 value. 
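The ASYMM_MULT_BY_QUANT_MULTIPLIER_* macros used by the fixed-point kernels above (and by the QSYMM16 variant described next) follow the gemmlowp reference arithmetic: a saturating rounding doubling high multiply followed by a rounding division by a power of two. A scalar C sketch of those two building blocks (illustrative, not the library's own code):

#include <stdint.h>

/* (a * b * 2) >> 32, rounded to nearest, with the single overflowing
 * input pair a == b == INT32_MIN saturated to INT32_MAX. */
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Round-to-nearest division by 2^exponent, ties away from zero. */
static int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
  const int32_t mask = (int32_t)((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* Roughly what the *_LESS_THAN_ONE path does for a non-negative shift. */
static int32_t multiply_by_quantized_multiplier(int32_t acc, int32_t multiplier, int shift)
{
  return rounding_divide_by_pot(rounding_doubling_high_mul(acc, multiplier), shift);
}

The *_GREATER_THAN_ONE variant, selected when the shift is negative, left-shifts the accumulator before the high multiply instead of dividing afterwards.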
The following computations will be performed by + * the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QSYMM16 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + + short4 res = convert_short4_sat(input_values); + +#if defined(MIN_BOUND) + res = max(res, (short4)MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (short4)MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global short *)dst_addr); +} +#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) + +#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. 
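The float-based output stage described next is simpler than the fixed-point one: the int32 accumulator (plus optional bias) is scaled by a single real multiplier, offset by the output zero point, rounded and saturated. A minimal scalar sketch, assuming QASYMM8 output:

#include <math.h>
#include <stdint.h>

/* Illustrative scalar version of the REAL_MULTIPLIER / OUTPUT_OFFSET stage. */
static uint8_t quantize_down_float_ref(int32_t acc, int32_t bias,
                                       float real_multiplier, float output_offset)
{
  float v = roundf((float)(acc + bias) * real_multiplier + output_offset);
  if (v < 0.f)   v = 0.f;   /* use [-128, 127] instead for QASYMM8_SIGNED */
  if (v > 255.f) v = 255.f;
  return (uint8_t)v;
}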
The following computations will be + * performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Requantize + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset and scalar scale factor must be passed at compile time using + * -DRESULT_OFFSET, -DREAL_MULTIPLIER + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Supported data + * type: same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in + * bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] dst_step_w src_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) +#if defined(DST_HEIGHT) + TENSOR4D_DECLARATION(dst)) +#else // defined(DST_HEIGHT) + TENSOR3D_DECLARATION(dst)) +#endif // defined(DST_HEIGHT) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Convert to float + float4 input_values_f = convert_float4(input_values); + input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET); + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl new file mode 100644 index 000000000..51919c8a5 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants + +/** Fill the tensor's planes with all value + * @attention The following variables must be passed at compile time: + * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * -# -DCONSTANT_VALUE = The value use to fill the tensor's planes + * -# -DVEC_SIZE = Vector size + * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might + * need to step back a bit) + * + * @param[in] tensor_ptr Pointer to the source image. Data types + * supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. 
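The LAST_ACCESSED_X mechanism mentioned above is the usual trick for vectorised writes when the row width is not a multiple of VEC_SIZE: the last work item is shifted back so its vector store overlaps the previous one instead of running out of bounds. A host-side C analogue (hypothetical helper, assuming the host sets LAST_ACCESSED_X = width - VEC_SIZE):

/* Emulates one row of the memset kernel: every "work item" writes a full
 * vector of vec_size elements; the final one is shifted back into bounds. */
static void fill_row(float *row, int width, int vec_size, float value)
{
  int last_accessed_x = width - vec_size;        /* assumed host-side setup */
  for (int gid = 0; gid * vec_size < width; ++gid)
  {
    int xi = gid * vec_size;
    int shift = xi - last_accessed_x;
    if (shift < 0) shift = 0;                    /* max(xi - LAST_ACCESSED_X, 0) */
    int start = xi - shift;                      /* shifted-back vector start */
    for (int j = 0; j < vec_size; ++j)
      row[start + j] = value;                    /* stands in for VSTORE(VEC_SIZE) */
  }
}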
+ * @param[in] tensor_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] tensor_step_x tensor_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] tensor_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] tensor_step_y tensor_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] value The value used to fill the pages of the tensor + */ +__kernel void memset(TENSOR3D_DECLARATION(tensor)) +{ + Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor); + +#if defined(VEC_SIZE) + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x; +#endif // defined(LAST_ACCESSED_X) + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = (DATA_TYPE)(CONSTANT_VALUE); + + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)tensor.ptr); +#else // !defined(VEC_SIZE) + *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE); +#endif // defined(VEC_SIZE) +} + +#endif // Check for compile time constants diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl new file mode 100644 index 000000000..96f2f9ef0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && \ + defined(SRC_WIDTH) + +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE) +#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE) + +#if defined(CONST_VAL) +/** Perform a pad operation when PaddingMode is CONSTANT + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, + * e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. + * -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. + * -DSRC_WIDTH=224 + * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile + * flag, e.g. -DSELECT_DT=float + * @note In case pad left is more than the vector size, the number of threads to skip along the X + * axis must be passed using the -DNUM_THREADS_TO_SKIP_X compile flag, e.g. + * -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE) + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be + * passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. + * -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If pad also needs to be added to the batch of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. + * -DPAD_W_BEFORE=3) + * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: + * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * image + * @param[in] batch (Optional) Batch index if 4D pad must be applied + */ +__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(PAD_W_BEFORE) + , + uint batch +#endif // defined(PAD_W_BEFORE) +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + uint cond = 0; + +#if defined(PAD_W_BEFORE) + cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE); +#endif // defined(PAD_W_BEFORE) +#if defined(PAD_Z_BEFORE) + cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE); +#endif // defined(PAD_Z_BEFORE) + + if (cond) + { + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + VSTORE(VEC_SIZE) + ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr); + } + else + { + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#if defined(NUM_THREADS_TO_SKIP_X) + /* In case the pad left is greater than the vector size, and we are past the threads operating + * solely on pad values, the input pointer must be brought back along the X axis to start from + * the first non-pad values. + * + * E.g. 
with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|: + * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1)) + * < PAD_X_BEFORE + * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2, + * 3)) < PAD_X_BEFORE + * -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x * + * VEC_SIZE) == 4) values, reading |4 5| + * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and + * check that it is >= to the current x + * -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying + * this constant by the input's step along the X axis + * -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it + * will read the desired values |0 1| + */ + src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X); +#endif // defined(NUM_THREADS_TO_SKIP_X) +#if defined(PAD_Z_BEFORE) + src.ptr -= PAD_Z_BEFORE * src_step_z; +#endif // defined(PAD_Z_BEFORE) +#if defined(PAD_W_BEFORE) + src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z; +#endif // defined(PAD_W_BEFORE) + + VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + + VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT); + VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE); +#if defined(PAD_Y_BEFORE) + cond |= + (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE); +#endif // defined(PAD_Y_BEFORE) + VSTORE(VEC_SIZE) + (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0, + (__global DATA_TYPE *)dst.ptr); + } +} +#endif // defined(CONST_VAL) + +#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && \ + defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && \ + defined(AFTER_PAD_FACT_X) + +#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1 +#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n) +#define SYMM_REFL_LEFT(x, n0, n1) \ + select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0) +#define SYMM_REFL_RIGHT(x, n0, n1) \ + select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0) + +/** Perform a pad operation when PaddingMode is SYMMETRIC + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. + * -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. + * -DSRC_WIDTH=224 + * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile + * flag, e.g. -DSELECT_DT=float + * @note Number of values to the left when operating across left padding must be passed using the + * -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5 + * @note Number of values to the left when operating across right padding must be passed using the + * -DPAD_X_AFTER_REMAINDER compile flag, e.g. 
-DPAD_X_AFTER_REMAINDER=6 + * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is + * REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=6 + * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the + * -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER=5 + * @note When after pad X, starting point to read backward from must be passed using the + * -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253 + * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be + * set to 0 + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be + * passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. + * -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If the starting point to read backward from is less than the output's last element accessed + * in the X, the following compile flags must be passed at compile time to avoid negative offsets: + * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation + * attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: + * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * image + */ +__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Get current thread position + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Define conditions based on the thread X position w.r.t. 
pad left and right + const int x_out_first = x * VEC_SIZE; + const int x_out_last = x_out_first + VEC_SIZE; + const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE); + const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE); + const int is_inside_input = + (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE)); + const int is_across_pad_right = + (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE)); + const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE)); + + // Calculate base pointers + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes; + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Calculate input tensor's offset based on the defined conditions + int x_offset = 0; + x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left); + x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input); + x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right); + x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right); + +#if defined(AFTER_PAD_REM) + int neg_offs = x_offset < 0; + x_offset = max(x_offset, 0); +#endif // defined(AFTER_PAD_REM) + + // Load input values from the computed offset + int y_in = y; + int z_in = z; +#if defined(PAD_Y_BEFORE) + y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE); + y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, + y >= (SRC_HEIGHT + PAD_Y_BEFORE)); +#endif // defined(PAD_Y_BEFORE) +#if defined(PAD_Z_BEFORE) + z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE); + z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, + z >= (SRC_DEPTH + PAD_Z_BEFORE)); +#endif // defined(PAD_Y_BEFORE) + + src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z; + +#if SRC_WIDTH == 1 + VSTORE(VEC_SIZE) + ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr); +#else // SRC_WIDTH == 1 + + VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + + // Choose rearrangement policy based on the defined conditions + src_vals = + select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), + SCALAR_COND(is_across_pad_left)); + src_vals = + select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), + SCALAR_COND(is_across_pad_right)); + src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE), + SCALAR_COND((is_before_pad_left || is_after_pad_right))); +#if defined(AFTER_PAD_REM) + src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs)); +#endif // defined(AFTER_PAD_REM) + + // Store + VSTORE(VEC_SIZE) + (src_vals, 0, (__global DATA_TYPE *)dst.ptr); +#endif // SRC_WIDTH == 1 +} +#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && + // defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && + // defined(AFTER_PAD_FACT_X) +#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && + // defined(SRC_WIDTH) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h new file mode 100644 index 000000000..cfc811cce --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2021 Samsung Electronics 
Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_REPEAT_H +#define ARM_COMPUTE_REPEAT_H + +#include "helpers.h" + +/** Macros that help in loop unrolling */ +// Repeat macros with 3 param, excluding the implicit ID param +#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) +#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ + P_X##_DEF(1, P_A, P_B, P_C); \ + REPEAT_3_1(P_X, P_A, P_B, P_C) +#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ + P_X##_DEF(2, P_A, P_B, P_C); \ + REPEAT_3_2(P_X, P_A, P_B, P_C) +#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ + P_X##_DEF(3, P_A, P_B, P_C); \ + REPEAT_3_3(P_X, P_A, P_B, P_C) +#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ + P_X##_DEF(4, P_A, P_B, P_C); \ + REPEAT_3_4(P_X, P_A, P_B, P_C) +#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ + P_X##_DEF(5, P_A, P_B, P_C); \ + REPEAT_3_5(P_X, P_A, P_B, P_C) +#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ + P_X##_DEF(6, P_A, P_B, P_C); \ + REPEAT_3_6(P_X, P_A, P_B, P_C) +#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ + P_X##_DEF(7, P_A, P_B, P_C); \ + REPEAT_3_7(P_X, P_A, P_B, P_C) +#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ + P_X##_DEF(8, P_A, P_B, P_C); \ + REPEAT_3_8(P_X, P_A, P_B, P_C) +#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ + P_X##_DEF(9, P_A, P_B, P_C); \ + REPEAT_3_9(P_X, P_A, P_B, P_C) +#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ + P_X##_DEF(A, P_A, P_B, P_C); \ + REPEAT_3_10(P_X, P_A, P_B, P_C) +#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ + P_X##_DEF(B, P_A, P_B, P_C); \ + REPEAT_3_11(P_X, P_A, P_B, P_C) +#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ + P_X##_DEF(C, P_A, P_B, P_C); \ + REPEAT_3_12(P_X, P_A, P_B, P_C) +#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ + P_X##_DEF(D, P_A, P_B, P_C); \ + REPEAT_3_13(P_X, P_A, P_B, P_C) +#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ + P_X##_DEF(E, P_A, P_B, P_C); \ + REPEAT_3_14(P_X, 
P_A, P_B, P_C) +#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ + P_X##_DEF(F, P_A, P_B, P_C); \ + REPEAT_3_15(P_X, P_A, P_B, P_C) + +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \ + REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) // One level of indirection to ensure order of expansion + // does not affect preprocessing P_NUM +#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) + +// Repeat macros with 4 param, excluding the implicit ID param +#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) +#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(1, P_A, P_B, P_C, P_D); \ + REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(2, P_A, P_B, P_C, P_D); \ + REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(3, P_A, P_B, P_C, P_D); \ + REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(4, P_A, P_B, P_C, P_D); \ + REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(5, P_A, P_B, P_C, P_D); \ + REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(6, P_A, P_B, P_C, P_D); \ + REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(7, P_A, P_B, P_C, P_D); \ + REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(8, P_A, P_B, P_C, P_D); \ + REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(9, P_A, P_B, P_C, P_D); \ + REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(A, P_A, P_B, P_C, P_D); \ + REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(B, P_A, P_B, P_C, P_D); \ + REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(C, P_A, P_B, P_C, P_D); \ + REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(D, P_A, P_B, P_C, P_D); \ + REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(E, P_A, P_B, P_C, P_D); \ + REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(F, P_A, P_B, P_C, P_D); \ + REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) + +#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \ + REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) // One level of indirection to ensure order of + // expansion does not affect preprocessing P_NUM +#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) + +// Macro for initializing N variables. Generates N statements that defines VAR##N = +// RHS_ACCESSOR_DEF(...) +#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL +#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) + +// Macro for initializing N variables by converting the data type. Generates N statements that +// defines VAR##N = RHS_ACCESSOR_DEF(...) +#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \ + TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) +#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \ + REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) + +// Macro for adding a constant to N variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) 
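+// For example (illustrative only, with an arbitrary variable name "acc"):
+//   REPEAT_ADD_CONST_TO_VAR(3, int4, acc, 8)
+// unrolls via REPEAT_3_3 into three statements, one per ID from N-1 down to 0:
+//   acc2 += (int4)8; acc1 += (int4)8; acc0 += (int4)8;
+// The remaining REPEAT_* helpers in this header unroll in the same way.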
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL +#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) + +// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables +// (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...) +#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL +#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) \ + REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) + +// Macro for adding a vector to N-variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) +#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC +#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) + +// Macro for adding a two N-variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) +#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID +#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) + +// Macro for performing Max between a constant and N variables. Generates N statements that defines +// VAR##N =RHS_ACCESSOR_DEF(...) +#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) +#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) + +// Macro for performing Min between a constant and N variables. Generates N statements that defines +// VAR##N =RHS_ACCESSOR_DEF(...) +#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) +#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) + +// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N +// statements that defines VAR##N =RHS_ACCESSOR_DEF(...) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) + +// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N +// statements that defines VAR##N =RHS_ACCESSOR_DEF(...) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) + +// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables. 
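+// Both candidate rescalings below are computed and the final value is chosen per element with
+// select(): channels whose RES_SHIFT is negative take the GREATER_THAN_ONE variant, all other
+// channels the LESS_THAN_ONE variant.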
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + ({ \ + VEC_DATA_TYPE(int, N0) \ + VAR##ID_shift_lt0 = \ + ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ + VEC_DATA_TYPE(int, N0) \ + VAR##ID_shift_gt0 = \ + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ + VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ + }) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) + +#endif // ARM_COMPUTE_REPEAT_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl new file mode 100644 index 000000000..8da8bfc8e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Perform tensor reshape + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * + * @param[in] input_ptr Pointer to the first source tensor. 
Supported + * data types: All + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] input_shape Input spatial shape + * @param[in] output_shape Output spatial shape + */ +__kernel void reshape_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), + int2 input_shape, int2 output_shape) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); + + int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); + + // Linearize index + int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y; + + // Translate to output + int3 out_id; + out_id.x = linear_idx % output_shape.x; + out_id.y = (linear_idx / output_shape.x) % output_shape.y; + out_id.z = linear_idx / (output_shape.x * output_shape.y); + + // Store result + *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = + *((__global DATA_TYPE *)in.ptr); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp index 45307fad7..987409739 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -39,16 +39,18 @@ */ #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" 
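+// These helpers (AccessWindowStatic, CLValidate, window and auto-configuration utilities) are now
+// taken from the library-internal src/core tree instead of the public arm_compute/core headers.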
+#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index ffa2c5a67..a5daa2410 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -43,6 +43,8 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/AccessWindowStatic.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp index 3f2ae357d..dc06bfbb3 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -41,13 +41,16 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include <cstddef> diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index e4c617c8d..4206f1fd4 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 8b5885225..62da2376e 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -45,6 +45,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/core/UtilsEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index f0a761b97..03ca6ddcb 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -111,7 +112,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso _hits = hits; // Make _lookup_indices tensor - 
_lookup_indices = support::cpp14::make_unique<CLTensor>(); + _lookup_indices = std::make_unique<CLTensor>(); _lookup_indices->allocator()->init( TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index dab6480b2..945af3c51 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -42,12 +42,16 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include "support/ToolchainSupport.h" diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp new file mode 100644 index 000000000..a00fc5e2e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" + +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +CLMemsetKernel::CLMemsetKernel() : ICLKernel(), _tensor(nullptr), _full_window() {} + +void CLMemsetKernel::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window) +{ + configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window); +} + +void CLMemsetKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, + const PixelValue &constant_value, Window *window) +{ + ARM_COMPUTE_UNUSED(compile_context); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window)); + + _tensor = tensor; + + const DataType data_type = tensor->info()->data_type(); + const int vec_size_x = 16 / tensor->info()->element_size(); + + // Create and update the window (if needed) + _full_window = calculate_max_window(*tensor->info()); + Window win = _full_window; + if (window != nullptr) + { + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window); + win = *window; + } + + const int output_width_x = win.num_iterations(0); + const bool multi_access_x = output_width_x >= vec_size_x; + const bool remainder_x = output_width_x % vec_size_x > 0; + + if (multi_access_x) + { + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type)); + build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max<int>(output_width_x - vec_size_x, 0))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("memset", build_opts.options())); +} + +Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, + Window *window) +{ + ARM_COMPUTE_UNUSED(tensor); + ARM_COMPUTE_UNUSED(constant_value); + if (window != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); + } + return Status{}; +} + +void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Collapse all the batches on the third + Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _tensor, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index 1d4b141a7..da7437e97 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ 
b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -40,15 +40,19 @@ #include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index ee633d437..cd5e571e9 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp index 0b8e7cc41..4c4cbe710 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp @@ -42,6 +42,10 @@ #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include <string> namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp new file mode 100644 index 000000000..b6efeac35 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" + +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(constant_value); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions()); + if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) + { + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3); + + const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); + for (size_t i = 0; i < padding.size(); ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect)); + ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect)); + } + } + + if (output->total_size() > 0) + { + TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); + } + + return Status{}; +} + +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode, + unsigned int &num_elems_processed_per_iteration) +{ + ARM_COMPUTE_UNUSED(constant_value, mode); + + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); + auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape)); + + num_elems_processed_per_iteration = + std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type()))); + if (input->dimension(0) < num_elems_processed_per_iteration) + { + num_elems_processed_per_iteration = + 1 << static_cast<unsigned int>(std::log2(input->dimension(0))); + } + + // Configure kernel 
window + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + const int input_start_x = + mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0; + const int input_start_y = + (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0; + + AccessWindowRectangle input_access(input, input_start_x, input_start_y, + num_elems_processed_per_iteration, 1); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + const bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLPadLayerKernelEx::CLPadLayerKernelEx() + : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false) +{ +} + +void CLPadLayerKernelEx::configure(const ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, + mode); +} + +void CLPadLayerKernelEx::configure(const CLCompileContext &compile_context, const ICLTensor *input, + ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_UNUSED(compile_context); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), padding, constant_value, mode)); + + _input = input; + _output = output; + _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3); + + // Configure window + unsigned int vec_size; + auto win_config = validate_and_configure_window(input->info(), output->info(), padding, + constant_value, mode, vec_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set build options + std::string kernel_name = "pad_layer_"; + + const DataType &data_type = input->info()->data_type(); + const unsigned int input_width = input->info()->dimension(0); + const unsigned int input_height = input->info()->dimension(1); + const unsigned int input_depth = input->info()->dimension(2); + const unsigned int pad_x_before = padding.at(0).first; + const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; + const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0; + const unsigned int pad_right_start = input_width + pad_x_before; + + _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0; + _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? 
-padding.at(1).first : 0; + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before)); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); + if (padding.size() > 1) + { + build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); + + if (padding.size() > 2) + { + build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before)); + build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth)); + } + } + + switch (mode) + { + case PaddingMode::CONSTANT: + { + kernel_name += "constant"; + + build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type)); + build_opts.add_option_if(pad_x_before >= vec_size, + "-DNUM_THREADS_TO_SKIP_X=" + + support::cpp11::to_string(pad_x_before / vec_size)); + + if (_4d_enabled) + { + build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first)); + build_opts.add_option("-DSRC_BATCH=" + + support::cpp11::to_string(input->info()->dimension(3))); + } + + break; + } + case PaddingMode::SYMMETRIC: + case PaddingMode::REFLECT: + { + kernel_name += "symmetric_reflect"; + + const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); + + const unsigned int pad_x_before_remainder = pad_x_before % vec_size; + const unsigned int pad_x_after_remainder = pad_right_start % vec_size; + const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect; + const unsigned int output_last_x = + ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); + + build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect)); + build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + + support::cpp11::to_string(pad_x_before_remainder)); + build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + + support::cpp11::to_string(pad_x_after_remainder)); + build_opts.add_option( + "-DPAD_X_BEFORE_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); + build_opts.add_option( + "-DPAD_X_AFTER_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); + build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x)); + build_opts.add_option_if(after_pad_fact_x < output_last_x, + "-DAFTER_PAD_REM=" + + support::cpp11::to_string(after_pad_fact_x % vec_size)); + + break; + } + default: + ARM_COMPUTE_ERROR("Padding mode not supported."); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); +} + +Status CLPadLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + unsigned int vec_size; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + output->clone().get(), padding, + constant_value, mode, vec_size) + .first); + + return Status{}; +} + +void CLPadLayerKernelEx::run(const Window &window, cl::CommandQueue 
&queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window win_in = window; + win_in.adjust(Window::DimX, _input_start_x, true); + win_in.adjust(Window::DimY, _input_start_y, true); + + Window slice_out = window.first_slice_window_3D(); + Window slice_in = win_in.first_slice_window_3D(); + unsigned int batch = 0; + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + if (_4d_enabled) + { + add_argument<unsigned int>(idx, batch++); + } + + enqueue(queue, *this, slice_out, lws_hint()); + } while (window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in)); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index b417a7103..9aa815f55 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -40,15 +40,19 @@ #include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index 3906009c2..70374ba61 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index 4a6374444..c9d6dc31c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -40,7 +40,7 @@ #include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" -#include "arm_compute/core/AccessWindowStatic.h" +#include "src/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -48,6 +48,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include <climits> diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp 
b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp index c88bef6d7..1d4d33ac2 100644 --- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -42,7 +42,7 @@ #include <algorithm> #include "arm_compute/core/Types.h" -#include "arm_compute/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEAsymm.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp index a8464afce..0551fc7db 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -43,10 +43,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <algorithm> #include <arm_neon.h> @@ -163,7 +163,7 @@ void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( const ITensor *input1, const ITensor *input2, ITensor *output, - std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) + std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *> map_function) { std::string function_to_call("op_"); function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; @@ -185,9 +185,9 @@ template <BinaryLogicalOperation op> std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) { - static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { - {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, - {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + static std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *> + map_function = {{"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; return configure_func(input1, input2, output, map_function); } @@ -196,7 +196,7 @@ void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const const ITensor *input2, ITensor *output) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); - configure_common(input1, input2, output); + configure_common(input1->info(), input2->info(), output->info()); switch (op) { case BinaryLogicalOperation::AND: @@ -251,5 +251,4 @@ Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); return Status{}; } - } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp index f935596e6..87e716b4f 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp +++ 
b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -39,16 +39,19 @@ */ #include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/SaturateCast.h" +#include "support/SaturateCast.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + +#include "src/core/NEON/INEKernel.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp index e3a77c6b1..3ad9ee945 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -47,6 +47,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + using namespace arm_compute; NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 000000000..375fa28e5 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> +#include <cstddef> +#include <cstdint> +#include <mutex> + +using namespace arm_compute; + +namespace +{ +inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0)); + + return Status{}; +} + +inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, + ITensorInfo *biases) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + bool window_changed = update_window_and_padding( + win, AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration), + AccessWindowStatic(biases, 0, 0, + ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), + biases->tensor_shape().y())); + + AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration); + + // Set the valid region for the accum tensor + Coordinates coord; + coord.set_num_dimensions(accum->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape())); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); + + _biases = biases; + _accum = accum; + + // Configure kernel window + auto win_config = validate_and_configure_window(accum->info(), biases->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, + const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(accum->clone().get(), biases->clone().get()).first); + + return Status{}; +} + +std::mutex m; +void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info) +{ + std::lock_guard<std::mutex> lock_guard(m); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window win_biases; + win_biases.set(Window::DimX, + Window::Dimension(window.x().start(), window.x().end(), window.x().step())); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in0_out(_accum, window); + Iterator in1(_biases, win_biases); + + switch (_accum->info()->data_type()) + { + case DataType::F32: + { + execute_window_loop( + window, + [&](const Coordinates &) { + const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr())); + const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr())); + const float32x4x4_t res = { + {vaddq_f32(accum.val[0], biases.val[0]), vaddq_f32(accum.val[1], biases.val[1]), + vaddq_f32(accum.val[2], biases.val[2]), vaddq_f32(accum.val[3], biases.val[3])}}; + + vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res); + }, + in0_out, in1); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + execute_window_loop( + window, + [&](const Coordinates &) { + const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr())); + const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr())); + const float16x8x2_t res = { + {vaddq_f16(accum.val[0], biases.val[0]), vaddq_f16(accum.val[1], biases.val[1])}}; + + vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res); + }, + in0_out, in1); + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp index c9f0799d4..d4144e6b9 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -40,7 +40,7 @@ #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -50,6 +50,9 @@ #include 
"arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { namespace diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp index 52b40e767..f178865b7 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -47,6 +47,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <unordered_map> using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp index 4dc0f5535..7804f9c6a 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -40,17 +40,22 @@ #include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <arm_neon.h> namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp index ad4728175..8ad998313 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -42,13 +42,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp index 0daff5c6a..e56fbf7f3 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -38,7 +38,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -47,6 +47,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { namespace diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 2306228d5..420e5063c 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -42,13 +42,16 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp index b02a48ef2..6b9b0d4b4 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -45,7 +45,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/Utils.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" namespace arm_compute { @@ -66,7 +68,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const unsigned int num_of_stages = - calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; @@ -132,7 +134,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(¬_reshaped_output, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(¬_reshaped_output, output)); return Status{}; } @@ -140,7 +142,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); + _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); _reduction_axis = axis; const TensorShape output_shape = 
arm_compute::misc::shape_calculator::compute_reduced_shape( @@ -204,7 +206,8 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * &_not_reshaped_output, axis, op); _results_vector[last_stage - 1].allocator()->allocate(); } - _reshape_kernel.configure(&_not_reshaped_output, output); + _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output, + output); _not_reshaped_output.allocator()->allocate(); } @@ -216,6 +219,6 @@ void CLArgMinMaxLayerEx::run() { CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); } - CLScheduler::get().enqueue(_reshape_kernel, false); + _reshape_kernel.run(); } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp index e5122ab8f..31c96b080 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -42,13 +42,14 @@ #include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, BinaryLogicalOperation op) { - auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + auto k = std::make_unique<CLBinaryLogicalOpKernel>(); k->configure(input1, input2, output, op); _kernel = std::move(k); @@ -57,7 +58,7 @@ void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTenso ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp index c7d0ac8e2..96f9c17a9 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp @@ -46,7 +46,7 @@ using namespace arm_compute; void CLCastBool::configure(ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>(); + auto k = std::make_unique<CLCastBoolKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 6359b4bcb..464f60dee 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -45,6 +45,8 @@ #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <memory> #include <tuple> diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp index ae9d8afc6..003ec8042 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -39,7 +39,6 @@ */ #include 
"arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" - #include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" using namespace arm_compute; @@ -47,7 +46,7 @@ using namespace arm_compute; void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups) { - auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + auto k = std::make_unique<CLEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index 79d0929a9..af936e873 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -45,7 +45,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" #include <algorithm> @@ -68,7 +67,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) { - auto k = support::cpp14::make_unique<CLTransposeKernel>(); + auto k = std::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 13d3acbac..c6a88d340 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -42,11 +42,11 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "support/Cast.h" #include <algorithm> @@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) { - auto k = support::cpp14::make_unique<CLTransposeKernel>(); + auto k = std::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index ac6982e6f..cda784541 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -19,6 +19,7 @@ #include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h> +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 000000000..cd7409417 --- /dev/null +++ 
b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "support/StringSupport.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1); + + return Status{}; +} + +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target, + unsigned int &num_elems_processed_per_iteration) +{ + // Select the vector size to use (8 for Bifrost; 16 for Midgard). + bool is_gpu_bifrost = + gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, GPUTarget::G51, + GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT); + num_elems_processed_per_iteration = is_gpu_bifrost ? 
8 : 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access( + biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), + biases->dimension(1)); + AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, biases_access, accum_access); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) +{ + configure(CLKernelLibrary::get().get_compile_context(), accum, biases); +} + +void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context, + ICLTensor *accum, const ICLTensor *biases) +{ + ARM_COMPUTE_UNUSED(compile_context); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); + + _biases = biases; + _accum = accum; + + // Get the target gpu + GPUTarget gpu_target = get_target(); + unsigned int vector_size = 0; + + // Configure kernel window + auto win_config = + validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type())); + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gemm_accumulate_biases", build_opts.options())); +} + +Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, + const ITensorInfo *biases, GPUTarget gpu_target) +{ + unsigned int num_elems_processed_per_iteration = 0; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), + biases->clone().get(), gpu_target, + num_elems_processed_per_iteration) + .first); + + return Status{}; +} + +void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window accum_slice = window.first_slice_window_2D(); + + Window biases_slice(accum_slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + add_2D_tensor_argument(idx, _accum, accum_slice); + add_1D_tensor_argument(idx, _biases, biases_slice); + + enqueue(queue, *this, accum_slice, lws_hint()); + } while (window.slide_window_slice_2D(accum_slice)); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp index e0b833b04..f380e3e2c 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -41,6 +41,8 @@ #include "arm_compute/runtime/CL/functions/CLGatherEx.h" #include 
"arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/kernels/CLGatherKernel.h" + #include "arm_compute/core/CL/kernels/CLGatherExKernel.h" using namespace arm_compute; @@ -48,7 +50,7 @@ using namespace arm_compute; void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) { - auto k = support::cpp14::make_unique<CLGatherExKernel>(); + auto k = std::make_unique<CLGatherExKernel>(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp index 65b89a389..9896abd4b 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -47,7 +47,7 @@ using namespace arm_compute; void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { - auto k = support::cpp14::make_unique<CLHashtableLookupKernel>(); + auto k = std::make_unique<CLHashtableLookupKernel>(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp index 5a7e40839..ca45a57f8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma, ICLTensor *beta, float epsilon) { - auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + auto k = std::make_unique<CLInstanceNormalizationLayerKernelEx>(); k->configure(input, output, gamma, beta, epsilon); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp index 28e5bc0da..2bdc451b3 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -46,7 +46,7 @@ using namespace arm_compute; void CLNeg::configure(ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + auto k = std::make_unique<CLNegKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp index aa9f32ec6..759a19ff3 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp @@ -41,7 +41,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/kernels/CLOneHotKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + namespace arm_compute { CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp new file mode 100644 index 000000000..4d940e966 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021 
Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h" +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" + +namespace arm_compute +{ +CLPadLayerEx::CLPadLayerEx() + : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()), + _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false) +{ +} + +void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, + mode); +} + +void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor *input, + ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(input->info(), output->info(), padding, constant_value, mode)); + + _perform_pad = std::any_of(padding.begin(), padding.end(), + [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); + + if (_perform_pad) + { + _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); + } + else + { + Window copy_window = Window(); + copy_window.use_tensor_dimensions(output->info()->tensor_shape()); + // Copy the input to the whole output if no padding is applied + _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window); + } +} +Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { + return info.first > 0 || info.second > 0; + }); + + if (perform_pad) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernelEx::validate(input, output, padding, constant_value, mode)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output)); + } + return Status{}; +} +void CLPadLayerEx::run() +{ + if (_perform_pad) + { + CLScheduler::get().enqueue(*_pad_kernel); + } + else + { + CLScheduler::get().enqueue(*_copy_kernel); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index c246041bb..6740835a8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -61,7 +61,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1); // Create temporary tensor infos - auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + auto interm_tensors = std::make_unique<TensorInfo[]>(num_of_interm_tensors); // Create intermediate tensor info TensorShape shape{input->tensor_shape()}; @@ -124,8 +124,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, throw std::runtime_error("CLReduceOperation: there is no axis to reduce"); } - _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); - _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); + _interm_tensors = std::make_unique<CLTensor[]>(num_of_interm_tensors); + _reduce_kernels = std::make_unique<CLReduceOperationKernel[]>(num_of_kernels); // Set a vector that is ordered ICLTensors sequentially. 
std::vector<ICLTensor *> tensors; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp index 12c0aa829..73f5f6eb1 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -47,6 +47,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" #include <cassert> using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index 0754fd813..f3f093c18 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -79,7 +79,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC { case DeconvolutionMethod::DIRECT: { - auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>(); + auto f = std::make_unique<CLDirectTransposeConvLayer>(); f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info); _function = std::move(f); @@ -87,7 +87,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC } case DeconvolutionMethod::GEMM: { - auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); f->configure(compile_context, input, weights, bias, output, deconv_info); _function = std::move(f); break; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp index 2fc94b267..e6b7329d1 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -38,11 +38,10 @@ * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" #include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" #include "arm_compute/core/ITensor.h" -#include "support/MemorySupport.h" #include <utility> @@ -53,7 +52,7 @@ template <BinaryLogicalOperation COP> void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output) { - auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + auto k = std::make_unique<NEBinaryLogicalOperationKernel>(); k->configure(COP, input1, input2, output); _kernel = std::move(k); } @@ -69,7 +68,7 @@ Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op) { - auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + auto k = std::make_unique<NEBinaryLogicalOperationKernel>(); k->configure(op, input1, input2, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp index 6ad3e1b12..f6eec2603 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp @@ -40,13 +40,12 @@ #include "arm_compute/runtime/NEON/functions/NECastBool.h" #include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" -#include "support/MemorySupport.h" using namespace arm_compute; void NECastBool::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>(); + auto k = std::make_unique<NECastBoolKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp index e0ab3e025..99fc5c579 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -41,13 +41,12 @@ #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" -#include "support/MemorySupport.h" using namespace arm_compute; void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) { - auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + auto k = std::make_unique<NEEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index e212a03c7..fbd88fff0 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) { - auto k = support::cpp14::make_unique<NETransposeKernel>(); + auto k = std::make_unique<NETransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp 
b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index a639f2979..758f7dc59 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -50,7 +50,8 @@ #include <algorithm> #include <cmath> -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace @@ -164,9 +165,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei const ITensor *biases, ITensor *output, FullyConnectedLayerInfo fc_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); @@ -348,7 +348,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input)); input_to_use = &flatten_input; } else @@ -374,9 +374,13 @@ void NEFullyConnectedLayerEx::run() if (!_is_prepared) { if (!_are_weights_reshaped) + { _reshape_weights_output.allocator()->allocate(); + } if (!_are_weights_converted) + { _converted_weights_output.allocator()->allocate(); + } _is_prepared = true; } @@ -407,7 +411,7 @@ void NEFullyConnectedLayerEx::run() // Linearize input if it comes from a convolutional layer if (_is_fc_after_conv) { - NEScheduler::get().schedule(&_flatten_kernel, Window::DimY); + _flatten_kernel.run(); } // Run matrix multiply @@ -490,3 +494,4 @@ void NEFullyConnectedLayerEx::prepare() } #endif } +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index 234c783f9..2199839fb 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -19,6 +19,8 @@ #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h> +#include "src/core/helpers/AutoConfiguration.h" +#include <cassert> using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp index 433c35d58..e5607ab9a 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp @@ -41,7 +41,6 @@ #include "arm_compute/runtime/NEON/functions/NEGatherEx.h" #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" -#include "support/MemorySupport.h" #include <utility> @@ -49,7 +48,7 @@ namespace arm_compute { void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) { - auto k = support::cpp14::make_unique<NEGatherKernelEx>(); + auto k = std::make_unique<NEGatherKernelEx>(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git 
a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp index 52d58accf..7cc6c89e7 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -41,14 +41,13 @@ #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" -#include "support/MemorySupport.h" using namespace arm_compute; void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, ITensor *hits) { - auto k = support::cpp14::make_unique<NEHashtableLookupKernel>(); + auto k = std::make_unique<NEHashtableLookupKernel>(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp index 275c55024..e0620bad2 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp @@ -39,14 +39,14 @@ */ #include "arm_compute/runtime/NEON/functions/NEOneHot.h" #include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" -#include "support/MemorySupport.h" + #include <utility> namespace arm_compute { void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>(); + auto k = std::make_unique<NEOneHotKernel>(); k->configure(indices, depth, on_value, off_value, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp index c45c335b3..a30c00ea1 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -40,11 +40,13 @@ #include "arm_compute/runtime/NEON/functions/NEReduceOperation.h" -#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/Tensor.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp index b21717e86..7a1342644 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -40,9 +40,13 @@ #include "arm_compute/runtime/NEON/functions/NEReduceSum.h" -#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute; diff --git 
a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index 50311071b..4675121b2 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -44,6 +44,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute::misc::shape_calculator; diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index 10f3ecbd3..c5dd63b5b 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -111,6 +111,8 @@ struct SoftmaxParams int32_t zero_point; float scale; float *table; + uint8_t *uint8_table1; + uint8_t *uint8_table2; }; struct PackParams diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index f73c01523..9aae0a957 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -20,6 +20,8 @@ #include "Shape.h" +#include "neon/neon_check.h" + #include <algorithm> #include <cstdint> #include <fixedpoint/fixedpoint.h> @@ -29,6 +31,11 @@ namespace nnfw namespace cker { +template <typename T> struct is_quant8 +{ + static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value; +}; + template <typename T> inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max) { @@ -106,6 +113,34 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); } +#ifdef USE_NEON +inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val, + int32_t quantized_multiplier, int32_t shift) +{ + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); + int32x4x4_t result; + + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[1] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[2] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[3] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup); + + return result; +} +#endif + inline int NodeOffset(int b, int h, int w, int height, int width) { return (b * height + h) * width + w; diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h index a70e39cc9..e10f02ad4 100644 --- a/compute/cker/include/cker/operation/AveragePool.h +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -395,6 +395,129 @@ void AveragePool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, } } +template <> +void AveragePool<int8_t>(const PoolParams ¶ms, const Shape &input_shape, + const int8_t *input_data, const Shape &output_shape, int8_t *output_data) +{ + // Here, and in other pooling ops, in order to maintain locality of reference, + // to minimize some recalculations, and to load into NEON 
vector registers, we + // use an inner loop down the depth. Since depths can be large and hence we + // would need arbitrarily large temporary storage, we divide the work up into + // depth tranches just within the batch loop. + static constexpr int kPoolingAccTrancheSize = 256; + + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + int32_t acc[kPoolingAccTrancheSize]; + for (int batch = 0; batch < batches; ++batch) + { + // We proceed through the depth in tranches (see comment above). The + // depth_base is the depth at the beginning of the tranche. The + // tranche_depth is the depth dimension of the tranche. + for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize) + { + const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + memset(acc, 0, tranche_depth * sizeof(acc[0])); + const int8_t *input_ptr = + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + for (int fy = filter_y_start; fy < filter_y_end; fy++) + { + const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); + for (int fx = filter_x_start; fx < filter_x_end; fx++) + { + const int8_t *input_channel_ptr = input_row_ptr; + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + int16x4_t acc_reg[4]; + int8x16_t input_reg = vld1q_s8(input_channel_ptr); + input_channel_ptr += 16; + acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg))); + acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg))); + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } + for (; channel <= tranche_depth - 8; channel += 8) + { + int16x4_t acc_reg[2]; + int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr)); + input_channel_ptr += 8; + acc_reg[0] = vget_low_s16(input_reg); + acc_reg[1] = vget_high_s16(input_reg); + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } +#endif + for (; channel < tranche_depth; ++channel) + { + acc[channel] += *input_channel_ptr++; + } + input_row_ptr += depth; + } + } 
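The scalar tail that follows rounds each per-channel sum to the nearest average and clamps it to the quantized activation range. A minimal standalone sketch of that step (the helper name and the worked value are illustrative, not part of the diff):

#include <algorithm>
#include <cstdint>

// Round half away from zero when averaging the accumulated sum, then clamp.
inline int8_t RoundedAveragePoolOutput(int32_t acc, int filter_count,
                                       int32_t activation_min, int32_t activation_max)
{
  int32_t a = acc > 0 ? (acc + filter_count / 2) / filter_count
                      : (acc - filter_count / 2) / filter_count;
  a = std::max(a, activation_min);
  a = std::min(a, activation_max);
  return static_cast<int8_t>(a);
}
// Example: a 2x2 window (filter_count = 4) with acc = 7 yields (7 + 2) / 4 = 2.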
+ int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base); + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 8; channel += 8) + { + int16_t buf[8]; + for (int i = 0; i < 8; i++) + { + buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count + : (acc[channel + i] - filter_count / 2) / filter_count; + } + int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf)); + buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max)); + buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min)); + vst1_s8(output_ptr + channel, buf8); + } +#endif + for (; channel < tranche_depth; ++channel) + { + int16_t a = acc[channel] > 0 ? (acc[channel] + filter_count / 2) / filter_count + : (acc[channel] - filter_count / 2) / filter_count; + a = std::max<int16_t>(a, params.quantized_activation_min); + a = std::min<int16_t>(a, params.quantized_activation_max); + output_ptr[channel] = static_cast<int8_t>(a); + } + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h index fe5f87746..c7878496a 100644 --- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -190,34 +190,34 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, } template <BinaryArithmeticOpType op_type, typename T> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data, - output_shape, output_data); + optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: throw std::runtime_error{"Quant8 Asymm NYI"}; - default: assert(false); break; @@ -256,33 +256,32 @@ inline void BinaryArithmeticOp(const 
BinaryArithmeticOpParam ¶ms, const Shap } template <BinaryArithmeticOpType op_type, typename T> -inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, - T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::BroadcastMulDispatchQuant8( - params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: case nnfw::cker::BinaryArithmeticOpType::POW: diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h index 5068eca96..145deda29 100644 --- a/compute/cker/include/cker/operation/BroadcastTo.h +++ b/compute/cker/include/cker/operation/BroadcastTo.h @@ -126,7 +126,7 @@ template <typename Device, typename T> struct BroadcastTo } } }; -} // functor +} // namespace functor template <typename T> inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape, diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h index b20bac3ac..16c937a27 100644 --- a/compute/cker/include/cker/operation/Conv.h +++ b/compute/cker/include/cker/operation/Conv.h @@ -138,6 +138,17 @@ public: } } + void operator()(const ConvParams ¶ms, const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, int8_t *output_data) + { + reference::Conv(params, _per_channel_output_multiplier.data(), _per_channel_output_shift.data(), + input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data); + } + std::vector<int32_t> &per_channel_output_multiplier() { return _per_channel_output_multiplier; } + std::vector<int> &per_channel_output_shift() { return 
_per_channel_output_shift; } + private: bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor, int32_t dilation_height_factor) @@ -180,6 +191,9 @@ private: Shape _im2col_shape; bool _need_im2col; bool _prepared; + // Per channel output multiplier and shift. + std::vector<int32_t> _per_channel_output_multiplier; + std::vector<int> _per_channel_output_shift; }; } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h index 436ddd8c9..06ee780bb 100644 --- a/compute/cker/include/cker/operation/DepthwiseConv.h +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -24,6 +24,7 @@ #include "cker/neon/neon_check.h" #include "cker/operation/optimized/DepthwiseConvFloat.h" #include "cker/operation/optimized/DepthwiseConvUint8.h" +#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h" #include "cker/CpuBackendThreadpool.h" namespace nnfw diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h index 13fccfd15..6721a7508 100644 --- a/compute/cker/include/cker/operation/Einsum.h +++ b/compute/cker/include/cker/operation/Einsum.h @@ -177,7 +177,7 @@ inline Shape copyShape(const Shape &shape) { return Shape::ExtendedShape(shape.DimensionsCount(), shape); } -} +} // namespace class Einsum { diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h index d657acc12..f88c3a5fb 100644 --- a/compute/cker/include/cker/operation/Fill.h +++ b/compute/cker/include/cker/operation/Fill.h @@ -24,7 +24,8 @@ namespace nnfw { namespace cker { -template <typename T> inline void Fill(const T value_data, const Shape &output_shape, T output_data) +template <typename T> +inline void Fill(const T *value_data, const Shape &output_shape, T *output_data) { int output_size = output_shape.FlatSize(); for (int i = 0; i < output_size; i++) diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h index cbebff142..f16e5019d 100644 --- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -772,7 +772,7 @@ PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1) } } // namespace random -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h index 7dc51fe94..6b7049ddf 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOp.h +++ b/compute/cker/include/cker/operation/Helper/RandomOp.h @@ -47,6 +47,6 @@ template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution> }; } // namespace functor -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_OP_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h index 6e9ffbdfd..c99f69709 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -157,7 +157,7 @@ operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *d } // namespace functor -} // end namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // 
__NNFW_CKER_HELPER_RANDOM_OP_CPU_H__ diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h index 5c82d111f..8e5fc22bb 100644 --- a/compute/cker/include/cker/operation/Quantize.h +++ b/compute/cker/include/cker/operation/Quantize.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved.* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +21,10 @@ #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" -#include <stdexcept> +#include <cassert> #include <iostream> +#include <stdexcept> + namespace nnfw { namespace cker @@ -41,6 +44,251 @@ inline void Quantize(const Shape &input_shape, const InputT *input_data, const S output_data[i] = clamped; } } + +inline void Quantize(const int32_t *multiplier, const int32_t *shift, int32_t channel_size, + int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, + int32_t *scratch, int8_t *output) +{ + // Here we're trying to quantize the raw accumulators: + // output_channels + // data data data data data + // rows data data data data data + // data data data data data + // .... + // + // In order to minimize the reload of the multipliers & shifts, once we load + // the multipliers & shifts, we load & quantize the raw accumulators for every + // row. +#ifdef USE_NEON + const int32x4_t output_offset_vec = vdupq_n_s32(output_zp); + const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min); + const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max); + const int32x4_t zeros = vdupq_n_s32(0); +#endif + + assert(total_size % channel_size == 0); + const int32_t rows = total_size / channel_size; + + int c = 0; + +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + for (; c <= channel_size - 8; c += 8) + { + int32x4_t out_shift_1 = vld1q_s32(shift + c); + int32x4_t out_shift_2 = vld1q_s32(shift + c + 4); + int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros); + int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros); + + // Right shift will be performed as left shift with negative values. + int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros); + int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros); + + int32x4_t out_mul_1 = vld1q_s32(multiplier + c); + int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4); + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32x4_t acc_1 = vld1q_s32(scratch + loc); + int32x4_t acc_2 = vld1q_s32(scratch + loc + 4); + + // Saturating Rounding Doubling High Mul. + acc_1 = vshlq_s32(acc_1, left_shift_1); + acc_1 = vqrdmulhq_s32(acc_1, out_mul_1); + acc_2 = vshlq_s32(acc_2, left_shift_2); + acc_2 = vqrdmulhq_s32(acc_2, out_mul_2); + + // Rounding Dividing By POT. + acc_1 = vrshlq_s32(acc_1, right_shift_1); + acc_2 = vrshlq_s32(acc_2, right_shift_2); + + // Add the output offset. + acc_1 = vaddq_s32(acc_1, output_offset_vec); + acc_2 = vaddq_s32(acc_2, output_offset_vec); + + // Apply the activation function. + acc_1 = vmaxq_s32(acc_1, output_activation_min_vec); + acc_1 = vminq_s32(acc_1, output_activation_max_vec); + acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); + acc_2 = vminq_s32(acc_2, output_activation_max_vec); + + // Saturating cast to int8 and store to destination. 
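The NEON pattern used above in Quantize(), shift left, vqrdmulhq_s32 against the Q31 multiplier, then vrshlq_s32 by a negative amount, is the vector form of the scalar fixed-point rescale used throughout cker. A minimal scalar sketch, assuming gemmlowp-style rounding (NEON's rounding shift can differ by one on negative ties); all function names here are illustrative:

#include <algorithm>
#include <cstdint>
#include <limits>

// Saturating (a * b * 2) >> 31 with rounding; vqrdmulhq_s32 is the per-lane NEON equivalent.
inline int32_t SatRoundingDoublingHighMul(int32_t a, int32_t b)
{
  const bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const int32_t result = static_cast<int32_t>((ab + nudge) / (1ll << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : result;
}

// Rounding shift right by a non-negative exponent (ties rounded away from zero).
inline int32_t RoundingDivideByPOTScalar(int32_t x, int exponent)
{
  const int32_t mask = static_cast<int32_t>((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// Per-lane meaning of the vshlq / vqrdmulhq / vrshlq sequence: shift left by
// max(shift, 0), scale by the Q31 multiplier, then rounding-shift right by -min(shift, 0).
inline int32_t MultiplyByQuantizedMultiplierScalar(int32_t x, int32_t multiplier, int shift)
{
  const int left_shift = std::max(shift, 0);
  const int right_shift = std::min(shift, 0);
  return RoundingDivideByPOTScalar(
    SatRoundingDoublingHighMul(x * (1 << left_shift), multiplier), -right_shift);
}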
+ const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); + const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); + const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2); + const int8x8_t res_s8 = vqmovn_s16(res_s16); + vst1_s8(output + loc, res_s8); + } + } + +#endif // USE_NEON + // Handle leftover values, one by one. This is very slow. + for (; c < channel_size; c++) + { + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32_t acc = scratch[loc]; + acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); + acc += output_zp; + acc = std::max(acc, output_min); + acc = std::min(acc, output_max); + output[loc] = static_cast<int8_t>(acc); + } + } +} + +template <typename input_type, typename output_type> +inline void Requantize(const input_type *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data) +{ + assert(!"Requantize: not supported type. It shouldn't reach here."); + UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint, + output_zeropoint, output_data); +} + +template <> +inline void Requantize<uint8_t, int8_t>(const uint8_t *input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, int8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const uint8x16_t input_vec = vld1q_u8(input_data + i); + const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec)); + const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec)); + int32x4x4_t input; + input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half))); + input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half))); + input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half))); + input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half))); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]); + const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]); + const int16x4_t 
narrowed_val_3 = vqmovn_s32(result.val[2]); + const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]); + const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2); + const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4); + const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half); + const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half); + const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half); + vst1q_s8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <> +inline void Requantize<int8_t, uint8_t>(const int8_t *input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, uint8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input_vec = vld1q_s8(input_data + i); + const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec)); + const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec)); + int32x4x4_t input; + input.val[0] = vmovl_s16(vget_low_s16(first_half)); + input.val[1] = vmovl_s16(vget_high_s16(first_half)); + input.val[2] = vmovl_s16(vget_low_s16(second_half)); + input.val[3] = vmovl_s16(vget_high_s16(second_half)); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]); + const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]); + const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]); + const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]); + + const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned); + const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned); 
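Both Requantize specializations implement the same per-element mapping; the NEON bodies only vectorize it. A minimal scalar sketch, assuming the scalar MultiplyByQuantizedMultiplier helper from cker/Utils.h (the function name and example values below are illustrative):

#include <algorithm>
#include <cstdint>
#include <limits>
#include "cker/Utils.h" // for nnfw::cker::MultiplyByQuantizedMultiplier

template <typename OutT>
inline OutT RequantizeOneValue(int32_t input, int32_t input_zeropoint, int32_t output_zeropoint,
                               int32_t effective_scale_multiplier, int32_t effective_scale_shift)
{
  // Re-center on the input zero point, rescale, re-center on the output zero point, then clamp.
  const int32_t centered = input - input_zeropoint;
  const int32_t rescaled = nnfw::cker::MultiplyByQuantizedMultiplier(
                             centered, effective_scale_multiplier, effective_scale_shift) +
                           output_zeropoint;
  const int32_t lo = std::numeric_limits<OutT>::min();
  const int32_t hi = std::numeric_limits<OutT>::max();
  return static_cast<OutT>(std::max(lo, std::min(hi, rescaled)));
}
// Example: uint8 input 200 (zero point 0) with an effective scale of roughly 1.0 and an
// int8 output zero point of -128 requantizes to 200 - 128 = 72.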
+ const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned); + const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned); + const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2); + const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4); + const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half); + const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half); + const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half); + vst1q_u8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<uint8_t>(clamped_output); + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index dbf938147..f54f2e6f1 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -46,6 +46,7 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape input_size *= input_dims[idx]; } reduce_size = input_dims[input_num_dims - 1]; + int offset = 0; for (int idx = 0; idx < input_size; idx++) { int r_idx = 0; @@ -55,14 +56,14 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data); for (; r_idx <= reduce_size - 32; r_idx += 32) { - float32x4_t a10 = vld1q_f32(input_data + r_idx); - float32x4_t a11 = vld1q_f32(input_data + r_idx + 4); - float32x4_t a12 = vld1q_f32(input_data + r_idx + 8); - float32x4_t a13 = vld1q_f32(input_data + r_idx + 12); - float32x4_t a20 = vld1q_f32(input_data + r_idx + 16); - float32x4_t a21 = vld1q_f32(input_data + r_idx + 20); - float32x4_t a22 = vld1q_f32(input_data + r_idx + 24); - float32x4_t a23 = vld1q_f32(input_data + r_idx + 28); + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16); + float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20); + float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24); + float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28); float32x4_t x0 = vaddq_f32(a10, a20); float32x4_t x1 = vaddq_f32(a11, a21); @@ -74,10 +75,23 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape float32x4_t y2 = vaddq_f32(y0, y1); tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2); } + for (; r_idx <= reduce_size - 16; r_idx += 16) + { + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + + float32x4_t x0 = vaddq_f32(a10, a11); + float32x4_t x1 = vaddq_f32(a12, a13); + + float32x4_t y0 = vaddq_f32(x0, x1); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0); + } for (; r_idx <= reduce_size - 8; r_idx += 8) { - float32x4_t a1 = vld1q_f32(input_data + r_idx); - float32x4_t a2 = vld1q_f32(input_data + 
r_idx + 4); + float32x4_t a1 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4); float32x4_t x = vaddq_f32(a1, a2); tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x); } @@ -88,13 +102,14 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape { if (r_idx == 0) { - output_data[idx] = input_data[idx * reduce_size]; + output_data[idx] = input_data[offset]; } else { - output_data[idx] += input_data[idx * reduce_size + r_idx]; + output_data[idx] += input_data[offset + r_idx]; } } + offset += reduce_size; } } #endif // NEON diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h index 8d9a7495f..ae5af7bb3 100644 --- a/compute/cker/include/cker/operation/ResizeBilinear.h +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -264,6 +264,91 @@ void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, batches, input_height, input_width, depth, params.output_height, params.output_width, height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); } + +inline void ComputeInterpolationValues(const int32_t value, const int32_t scale_10, + const bool half_pixel_centers, int32_t input_size, + int32_t *scaled_value, int32_t *lower_bound, + int32_t *upper_bound) +{ + if (half_pixel_centers) + { + *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9); + } + else + { + *scaled_value = value * scale_10; + } + *lower_bound = std::max(*scaled_value / (1 << 10), 0); + *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1); +} + +inline void ResizeBilinear(const ResizeBilinearParams &op_params, + const Shape &unextended_input_shape, const int8_t *input_data, + const Shape &unextended_output_shape, int8_t *output_data) +{ + // If half_pixel_centers is True, align_corners must be False. 
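The int8_t ResizeBilinear body below works entirely in Q10 fixed point: coordinates are scaled by 1 << 10, ComputeInterpolationValues splits them into an integer lower bound and a 10-bit fraction, and the four neighbour products form a Q20 value that is rounded back down by 1 << 20. A one-dimensional sketch of that arithmetic (helper name and values are illustrative only):

#include <cstdint>

// Interpolate between two neighbouring samples with a Q10 fractional weight.
inline int8_t LerpQ10(int8_t lower_sample, int8_t upper_sample, int32_t scaled_pos,
                      int32_t lower_bound)
{
  const int32_t frac = scaled_pos - (lower_bound << 10); // fraction in [0, 1 << 10)
  const int64_t acc = static_cast<int64_t>(lower_sample) * ((1 << 10) - frac) +
                      static_cast<int64_t>(upper_sample) * frac; // Q10 weighted sum
  const int64_t round = acc > 0 ? (1 << 9) : -(1 << 9);
  return static_cast<int8_t>((acc + round) / (1 << 10));
}
// Example: samples 0 and 100 with frac = 512 (i.e. 0.5) give (100 * 512 + 512) / 1024 = 50.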
+ assert(!op_params.half_pixel_centers || !op_params.align_corners); + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_height = input_shape.Dims(1); + const int32_t input_width = input_shape.Dims(2); + const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); + + const int32_t output_height = op_params.output_height; + const int32_t output_width = op_params.output_width; + + int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height; + int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width; + if (op_params.align_corners && output_height > 1) + { + height_scale_10 = + ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1); + } + if (op_params.align_corners && output_width > 1) + { + width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1); + } + + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + int32_t input_y, y0, y1; + ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height, + &input_y, &y0, &y1); + for (int x = 0; x < output_width; ++x) + { + int32_t input_x, x0, x1; + ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width, + &input_x, &x0, &x1); + for (int c = 0; c < depth; ++c) + { + const int64_t output_20_ll = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_lu = + static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) * + (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_rl = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0); + const int64_t output_20_ru = + static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) * + (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0); + const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru; + const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19); + const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20)); + output_data[Offset(output_shape, b, y, x, c)] = interpolation; + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h index 620c1f968..35ecde4ba 100644 --- a/compute/cker/include/cker/operation/SoftMax.h +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -23,6 +23,10 @@ #include "cker/Types.h" #include "cker/eigen/Utils.h" +#if __aarch64__ && __clang__ +#define TFLITE_SOFTMAX_USE_UINT16_LUT +#endif + #include <Eigen/Core> #include <fixedpoint/fixedpoint.h> #include <cmath> @@ -69,7 +73,7 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const } } } -} +} // namespace reference // Performs softmax along the input of size (input_size * batch_size). 
inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta, @@ -127,87 +131,306 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const out_mat.array().rowwise() *= scale; } -inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) -{ - const int32_t input_beta_multiplier = params.input_multiplier; - const int32_t input_beta_left_shift = params.input_left_shift; - const int diff_min = params.diff_min; - // The representation chosen for the input to the exp() function is Q5.26. - // We need to leave extra space since values that we skip might be as large as - // -32 before multiplying by input_beta_multiplier, and therefore as large as - // -16 afterwards. Note that exp(-8) is definitely not insignificant to - // accumulation, but exp(-16) definitely is. - static const int kScaledDiffIntegerBits = 5; - static const int kAccumulationIntegerBits = 12; - using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>; - using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>; - using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; +template <typename T> inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point) +{ + const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled)); + return prob_rnd + zero_point; +} + +#if !__aarch64__ +// With ARM64, rounding is faster than add + truncation. +template <> inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled, int32_t) +{ + return static_cast<int32_t>(prob_rescaled + 0.5f); +} +#endif + +inline void PopulateSoftmaxLookupTable(float *table, float input_scale, float beta) +{ + const float scale = -input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + for (int32_t val = 0; val <= max_uint8; ++val) + { + table[max_uint8 - val] = expf(scale * val); + } +} +template <typename In, typename Out> +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const In *input_data, + const Shape &output_shape, Out *output_data) +{ const int trailing_dim = input_shape.DimensionsCount() - 1; - const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); - for (int i = 0; i < outer_size; ++i) + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + for (int i = 0; i < excluding_last_dim; ++i) { - uint8_t max_in_row = 0; - for (int c = 0; c < depth; ++c) + int32_t max_val = std::numeric_limits<In>::min(); + // Find max quantized value. + for (int j = 0; j < last_dim; ++j) { - max_in_row = std::max(max_in_row, input_data[i * depth + c]); + max_val = std::max(max_val, static_cast<int32_t>(input_data[j])); } - FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); - for (int c = 0; c < depth; ++c) + float sum_exp = 0.0f; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const float *table_offset = ¶ms.table[max_uint8 - max_val]; + // Calculate normalizer sum(exp(x)). 
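The table-driven Softmax above reads params.table so that each entry is exp() applied to the (negative, scaled) distance of an input from the row maximum, which means the table must be filled by PopulateSoftmaxLookupTable before the kernel runs. The caller-side setup is not part of this diff, so the sketch below is an assumption: in the TFLite-derived original, scale is the output tensor's scale and zero_point its zero point, and the helper name is hypothetical.

#include <cstdint>
#include <vector>
#include "cker/Types.h"             // nnfw::cker::SoftmaxParams
#include "cker/operation/SoftMax.h" // nnfw::cker::PopulateSoftmaxLookupTable

// Hypothetical setup helper; buffer ownership and field choices are assumptions.
inline void PrepareQuantizedSoftmaxParams(float input_scale, float beta, float output_scale,
                                          int32_t output_zero_point,
                                          std::vector<float> &table_storage,
                                          nnfw::cker::SoftmaxParams &params)
{
  table_storage.resize(256); // one exp() entry per possible uint8 input value
  params.table = table_storage.data();
  nnfw::cker::PopulateSoftmaxLookupTable(params.table, input_scale, beta);
  params.scale = output_scale;           // used above as 1 / (sum_exp * scale)
  params.zero_point = output_zero_point; // added after rescaling each probability
}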
+ for (int j = 0; j < last_dim; ++j) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); - } + sum_exp += table_offset[input_data[j]]; } - int32_t fixed_sum_of_exps = sum_of_exps.raw(); - int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps)); - // This is the number of bits to the left of the binary point above 1.0. - // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and - // no later adjustment will be needed. - int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; - int32_t shifted_sum_minus_one = - static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32_t>(1) << 31)); + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + // Normalize and quantize probabilities. + for (int j = 0; j < last_dim; ++j) + { + const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp; + const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point); + output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + } + input_data += last_dim; + output_data += last_dim; + } +} + +#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT +// Looks up each element of <indices> in <table>, returns them in a vector. +inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], uint8x16_t indices) +{ + // Look up in 1st quarter of the table: top 2 bits of indices == 00 + uint8x16_t output1 = vqtbl4q_u8(table[0], indices); + // Look up in 2nd quarter of the table: top 2 bits of indices == 01 + uint8x16_t output2 = vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40))); + // Look up in 3rd quarter of the table: top 2 bits of indices == 10 + uint8x16_t output3 = vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80))); + // Look up in 4th quarter of the table: top 2 bits of indices == 11 + uint8x16_t output4 = vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0))); + + // Combine result of the 4 lookups. 
+ return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4)); +} - FixedPoint0 shifted_scale = - one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); +inline void PopulateSoftmaxUInt8LookupTable(uint8_t *uint8_table1, uint8_t *uint8_table2, + float input_scale, float beta) +{ + const float scale = input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const int32_t max_uint16 = std::numeric_limits<uint16_t>::max(); - for (int c = 0; c < depth; ++c) + for (int32_t val = 0; val <= max_uint8; ++val) + { + float input_to_exp = scale * (val - max_uint8); + int32_t temp = static_cast<int>(expf(input_to_exp) * max_uint16 + 0.5); + temp = std::min(max_uint16, temp); + uint8_t part1 = temp >> 8; + uint8_t part2 = temp & 0xff; + uint8_table1[val] = static_cast<uint8_t>(part1); + uint8_table2[val] = static_cast<uint8_t>(part2); + } +} + +inline int FindMaxValue(int size, const uint8_t *input_data, uint8_t offset) +{ + int32_t max_val = std::numeric_limits<uint8_t>::min(); + int j = 0; + + uint8x16_t max_val_dup = vdupq_n_u8(max_val); + uint8x16_t offset_dup = vdupq_n_u8(offset); + for (; j <= size - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data + j); + input_value = veorq_u8(input_value, offset_dup); + max_val_dup = vmaxq_u8(input_value, max_val_dup); + } + max_val = std::max(max_val, static_cast<int32_t>(vmaxvq_u8(max_val_dup))); + + for (; j < size; ++j) + { + max_val = std::max(max_val, static_cast<int32_t>(input_data[j] ^ offset)); + } + return max_val; +} + +#ifdef USE_NEON +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. +inline void StoreValue(int32x4x4_t value_to_store, int8_t *output) +{ + const int16x8_t result_1 = + vcombine_s16(vqmovn_s32(value_to_store.val[1]), vqmovn_s32(value_to_store.val[0])); + const int16x8_t result_2 = + vcombine_s16(vqmovn_s32(value_to_store.val[3]), vqmovn_s32(value_to_store.val[2])); + const int8x16_t result = vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1)); + vst1q_s8(output, result); +} + +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. +inline void StoreValue(int32x4x4_t value_to_store, uint8_t *output) +{ + const uint16x8_t result_1 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0]))); + const uint16x8_t result_2 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2]))); + const uint8x16_t result = vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1)); + vst1q_u8(output, result); +} + +#endif + +template <typename In, typename Out> +inline void SoftmaxInt8LUT(const SoftmaxParams ¶ms, const Shape &input_shape, + const In *input_data, const Shape &output_shape, Out *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + + // Offset is used to interpret the input data "correctly". + // If the input is uint8, the data will be unchanged. + // If the input is int8, since it will be reinterpret as uint8. + // e.g., + // int8 127 will be applied "offset" to become 255 in uint8. 
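A tiny standalone check of the reinterpretation described in the comment above: flipping the sign bit maps int8 values onto uint8 indices while preserving their order, which is what lets one pair of lookup tables serve both input types. The helper name is illustrative.

#include <cassert>
#include <cstdint>

// Map an int8 value to the uint8 index used for the LUTs (the offset = 0x80 case).
inline uint8_t ToLutIndex(int8_t v) { return static_cast<uint8_t>(v) ^ 0x80; }

int main()
{
  assert(ToLutIndex(-128) == 0);  // most negative int8 becomes the smallest index
  assert(ToLutIndex(0) == 128);
  assert(ToLutIndex(127) == 255); // int8 127 becomes 255, as the comment states
  return 0;
}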
+ uint8_t offset = 0; + if (std::is_same<In, int8_t>::value) + { + offset = 0x80; + } + + const uint8_t *input_data_uint = reinterpret_cast<const uint8_t *>(input_data); + + // This code uses ARM64-only instructions. + // TODO(b/143709993): Port to ARMv7 + + // Load the tables into registers. (4*4 128-bit registers) + uint8x16x4_t table1[4]; + table1[0] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 0); + table1[1] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 1); + table1[2] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 2); + table1[3] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 3); + + uint8x16x4_t table2[4]; + table2[0] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 0); + table2[1] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 1); + table2[2] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 2); + table2[3] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 3); + + for (int i = 0; i < excluding_last_dim; ++i) + { + // Find max quantized value. + int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset); + + int32_t sum_exp = 0; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const uint8_t table_offset = max_uint8 - max_val; + + // Calculate normalizer sum(exp(x)). + int sum_j = 0; + uint8x16_t table_offset_dup = vdupq_n_u8(table_offset); + uint8x16_t offset_dup = vdupq_n_u8(offset); + uint32x4_t sum_4 = vdupq_n_u32(0); + const int multiplier_shift = 8; + for (; sum_j <= last_dim - 16; sum_j += 16) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - - FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), - num_bits_over_unit + 31 - 8); - - output_data[i * depth + c] = static_cast<uint8_t>( - std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); - } - else - { - output_data[i * depth + c] = 0; - } + uint8x16_t input_value = vld1q_u8(input_data_uint + sum_j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + sum_4 = vpadalq_u16(sum_4, exp_value1); + sum_4 = vpadalq_u16(sum_4, exp_value2); + } + int temp = vgetq_lane_u32(sum_4, 0) + vgetq_lane_u32(sum_4, 1) + vgetq_lane_u32(sum_4, 2) + + vgetq_lane_u32(sum_4, 3); + sum_exp += temp; + + for (; sum_j < last_dim; ++sum_j) + { + const uint8_t index = (input_data_uint[sum_j] ^ offset) + table_offset; + + uint8_t part1 = params.uint8_table1[index]; + uint8_t part2 = params.uint8_table2[index]; + sum_exp += ((part1 << 8) + part2); + } + + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + + int32_t multiplier, shift; + QuantizeMultiplier(inv_sum_exp, &multiplier, &shift); + + // Normalize and quantize probabilities. 
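For reference, the two uint8 tables used here encode exp() as a 16-bit fixed-point value split across a high and a low byte; the scalar tails reconstruct an entry as (part1 << 8) + part2. A minimal sketch of that reconstruction (the function name is illustrative):

#include <cstdint>

// Rebuild the 16-bit fixed-point exp() value from the paired byte tables.
inline int32_t LookupExpQ16(const uint8_t *uint8_table1, const uint8_t *uint8_table2,
                            uint8_t index)
{
  const int32_t high_byte = uint8_table1[index];
  const int32_t low_byte = uint8_table2[index];
  return (high_byte << 8) + low_byte; // exp(x) scaled to the range [0, 65535]
}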
+ int j = 0; + const int32x4_t output_zp_dup = vdupq_n_s32(params.zero_point); + const int32x4_t max_val_dup = vdupq_n_s32(clamp_max); + const int32x4_t min_val_dup = vdupq_n_s32(clamp_min); + + for (; j <= last_dim - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data_uint + j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + int32x4x4_t output_value; + output_value.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value1))); + output_value.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value1))); + output_value.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value2))); + output_value.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value2))); + + int32x4x4_t temp_val = MultiplyByQuantizedMultiplier4Rows(output_value, multiplier, shift); + + temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup); + temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup); + temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup); + temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup); + + temp_val.val[0] = vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup); + temp_val.val[1] = vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup); + temp_val.val[2] = vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup); + temp_val.val[3] = vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup); + + StoreValue(temp_val, output_data + j); + } + for (; j < last_dim; ++j) + { + const uint8_t index = (input_data_uint[j] ^ offset) + table_offset; + const uint8_t part1 = params.uint8_table1[index]; + const uint8_t part2 = params.uint8_table2[index]; + const int32_t exp_value = (part1 << 8) + part2; + const int32_t output_value = MultiplyByQuantizedMultiplier(exp_value, multiplier, shift); + + output_data[j] = static_cast<Out>( + std::max(std::min(clamp_max, output_value + params.zero_point), clamp_min)); } + input_data_uint += last_dim; + output_data += last_dim; } } +#endif } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h index cdd812a08..dcf649ca1 100644 --- a/compute/cker/include/cker/operation/StatelessRandomUniform.h +++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -72,8 +72,8 @@ void Fill(random::PhiloxRandom random, Tensor *output) Distribution()); } -inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data, - const Shape &seed_shape, const int *seed_data, +inline void StatelessRandomUniform(const Shape &shape_shape, const int32_t *shape_data, + const Shape &seed_shape, const int32_t *seed_data, const Shape &output_shape, float *output_data) { Tensor shape_t; diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h index 62eb432ae..52c826c39 100644 --- a/compute/cker/include/cker/operation/Transpose.h +++ b/compute/cker/include/cker/operation/Transpose.h @@ -288,7 +288,7 @@ size_t Flatten(const Shape 
&input_shape, const Shape &output_shape, const Transp return flat_size; } -} // namespace anonymous (util) +} // namespace // Transpose2D only deals with typical 2D matrix transpose ops. // Perform transpose by transposing 4x4 blocks of the input, proceeding from diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h index 8c1d31b56..1fe3e1517 100644 --- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -35,6 +35,7 @@ namespace cker namespace optimized { +/* Old version: For Sub(float) and Div. */ template <typename ElementwiseF, typename ScalarBroadcastF, typename T> inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, bool switch_inputs, const Shape & /* unswitched_input1_shape */, @@ -122,8 +123,108 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, bool } } -inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +// New version: For Mul, Add and Sub(quant8) +template <typename ElementwiseF, typename ScalarBroadcastF, typename T> +inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &unswitched_params, + const Shape & /* unswitched_input1_shape */, + const T *unswitched_input1_data, + const Shape & /* unswitched_input2_shape */, + const T *unswitched_input2_data, + const Shape & /* output_shape */, T *output_data, + ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f) +{ + BinaryArithmeticOpParam switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const BinaryArithmeticOpParam ¶ms = use_unswitched ? unswitched_params : switched_params; + const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. + T *output_data_ptr = output_data; + const T *input1_data_ptr = input1_data; + const T *input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for + // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) + { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. 
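The new BinaryBroadcastFiveFold takes its innermost work as two functors: elementwise_f(size, params, input1_ptr, input2_ptr, output_ptr) for contiguous runs and scalar_broadcast_f(size, params, input1_value, input2_ptr, output_ptr) for the y4 == 1 case. A minimal illustrative pair, a plain int32 add that ignores the quantization fields, purely to show the expected call shapes (both function names are hypothetical):

#include <cstdint>
#include "cker/Types.h" // nnfw::cker::BinaryArithmeticOpParam

// Contiguous-run functor: matches the elementwise_f(y4, params, p1, p2, out) call.
inline void AddRunExample(int size, const nnfw::cker::BinaryArithmeticOpParam &,
                          const int32_t *input1, const int32_t *input2, int32_t *output)
{
  for (int i = 0; i < size; ++i)
    output[i] = input1[i] + input2[i];
}

// Scalar-broadcast functor: matches scalar_broadcast_f(y3, params, *p1, p2, out).
inline void AddScalarBroadcastExample(int size, const nnfw::cker::BinaryArithmeticOpParam &,
                                      int32_t broadcast_value, const int32_t *input2,
                                      int32_t *output)
{
  for (int i = 0; i < size; ++i)
    output[i] = broadcast_value + input2[i];
}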
+ for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + for (int i3 = 0; i3 < y3; ++i3) + { + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } + else + { + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. + for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_sum(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; @@ -142,9 +243,9 @@ inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t i return clamped_output; } -inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; @@ -218,6 +319,119 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms } } +inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) +{ + int i = 0; +#ifdef USE_NEON + const int8x16_t output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t 
input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_dup); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_dup); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_dup); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_dup); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, -params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = + vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = + vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), vdupq_n_s16(params.output_offset)); + const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, s)); + 
vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + struct BinaryOpFuncAddFloat { #ifdef USE_NEON @@ -473,12 +687,13 @@ getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam ¶ms) BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>); } -inline void AddQuant8(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + AddElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -493,9 +708,9 @@ inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape // Scalar-broadcast add that can be used for inner loop of more general // broadcast add, so that, for example, scalar-broadcast with batch will still // be fast. -inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -506,31 +721,115 @@ inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } } -inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +// Scalar-broadcast add that can be used for inner loop of more general +// broadcast add, so that, for example, scalar-broadcast with batch will still +// be fast. 
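// Per-element math, as implemented by the scalar fallback below (the NEON path
// computes the same quantities vectorized):
//   s1     = MultiplyByQuantizedMultiplierSmallerThanOneExp(
//              (input1_data + input1_offset) << left_shift, input1_multiplier, input1_shift)
//   s2     = the same rescaling applied to each input2 element
//   out[i] = clamp(MultiplyByQuantizedMultiplierSmallerThanOneExp(
//              s1 + s2, output_multiplier, output_shift) + output_offset,
//              quantized_activation_min, quantized_activation_max)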
+inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, int8_t input1_data, + const int8_t *input2_data, int8_t *output_data) { - if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) + using gemmlowp::RoundingDivideByPOT; + int i = 0; +#ifdef USE_NEON + const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); + const int8x8_t output_activation_min_vector = vdup_n_s8(params.quantized_activation_min); + const int8x8_t output_activation_max_vector = vdup_n_s8(params.quantized_activation_max); + + // Process broadcast scalar. + const int8x8_t input1_val_original = vdup_n_s8(input1_data); + const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); + const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); + const int16x4_t input1_val_high = vget_high_s16(input1_val); + const int16x4_t input1_val_low = vget_low_s16(input1_val); + int32x4_t x11 = vmovl_s16(input1_val_low); + int32x4_t x12 = vmovl_s16(input1_val_high); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); + + for (; i <= size - 8; i += 8) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = - [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_sum(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); + const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); + const int16x4_t input2_val_high = vget_high_s16(input2_val); + const int16x4_t input2_val_low = vget_low_s16(input2_val); + int32x4_t x21 = vmovl_s16(input2_val_low); + int32x4_t x22 = vmovl_s16(input2_val_high); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + int32x4_t s1 = vaddq_s32(x11, x21); + int32x4_t s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); + const int16x4_t s1_narrowed = vmovn_s32(s1); + const int16x4_t s2_narrowed = vmovn_s32(s2); + const int16x8_t s = + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const int8x8_t clamped = + vmax_s8(output_activation_min_vector, vmin_s8(output_activation_max_vector, vqmovn_s16(s))); + vst1_s8(output_data + i, clamped); } - else +#endif // NEON + + if (i < size) { - BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, 
const uint8_t *, const uint8_t *, - uint8_t *)>(AddElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(AddScalarBroadcastQuant8)); + // Process broadcast scalar. + const int32_t input1_val = params.input1_offset + input1_data; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + + for (; i < size; ++i) + { + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) +{ + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) + { + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_sum(params, a, b)); + }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + return; } + + BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + AddElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + AddScalarBroadcast)); } inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -592,8 +891,9 @@ inline void BroadcastSubDispatch(const BinaryArithmeticOpParam ¶ms, const Sh } } -inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_mul(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; @@ -607,9 +907,9 @@ inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t i return clamped_output; } -inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; @@ -671,12 +971,102 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms } } -inline void MulQuant8(const 
BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) +{ + int i = 0; +#ifdef USE_NEON + const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset); + const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); + for (; i <= size - 16; i += 16) + { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_vector); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_vector); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + + auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high); + auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low); + auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high); + auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); + + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + 
vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + MulElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -688,9 +1078,9 @@ inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); } -inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -701,29 +1091,109 @@ inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } } -inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +// Broadcast mul that can often be used for inner loop of broadcast Mul. +inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const int8_t broadcast_value, const int8_t *input2_data, + int8_t *output_data) +{ + const int16_t input1_val = params.input1_offset + broadcast_value; + + int i = 0; +#ifdef USE_NEON + const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); + const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); + for (; i <= size - 16; i += 16) + { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
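    // Only input2 is re-loaded each iteration: the broadcast scalar has already been
    // folded into input1_val above and enters through vmull_n_s16 below.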
+ const auto input2_val_original = vld1q_s8(input2_data + i); + const auto input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + + const auto input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const auto input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + + const auto input2_val_low_low = vget_low_s16(input2_val_low); + const auto input2_val_low_high = vget_high_s16(input2_val_low); + const auto input2_val_high_low = vget_low_s16(input2_val_high); + const auto input2_val_high_high = vget_high_s16(input2_val_high); + + auto p1 = vmull_n_s16(input2_val_high_high, input1_val); + auto p2 = vmull_n_s16(input2_val_high_low, input1_val); + auto p3 = vmull_n_s16(input2_val_low_high, input1_val); + auto p4 = vmull_n_s16(input2_val_low_low, input1_val); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); + + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = - [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_mul(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_mul(params, a, b)); + }; + 
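    // Generic (slow) broadcast fallback: the reference helper walks the full broadcast
    // index space and applies quant8_mul element by element through fn.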
reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); return; } BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(MulElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(MulSimpleBroadcastQuant8)); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + MulElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + MulSimpleBroadcast)); } inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -741,10 +1211,8 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Sh return; } auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); - BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); } inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h index d4397933a..17b2fc7a2 100644 --- a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h @@ -1243,8 +1243,8 @@ inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &in } } -} // nnfw -} // cker -} // optimized +} // namespace optimized +} // namespace cker +} // namespace nnfw #endif diff --git a/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h new file mode 100644 index 000000000..bd8497920 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h @@ -0,0 +1,2138 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ + +#include "cker/CpuBackendThreadpool.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" +#include "cker/operation/Quantize.h" + +#include <fixedpoint/fixedpoint.h> +#include <public/gemmlowp.h> + +namespace nnfw +{ +namespace cker +{ +namespace optimized_integer_ops +{ + +// Category of depthwise convolution output rounding. +enum class DepthwiseConvOutputRounding +{ + kNone = 0, // Invalid: specific method must be specified. + kAwayFromZero, // Original method: exact halves rounded away from zero. + kUpward, // Halves towards +infinity: adds 0.5 before truncate. + // This is where a future kNearestEven would be placed. +}; + +// Category of depthwise convolution depth multiplication. +enum class DepthwiseConvDepthMultiplication +{ + kNoMultiplication = 0, // Depth multiplier = 1. + kUnitInputDepth, // Input depth = 1, output depth = depth multiplier. +}; + +namespace depthwise_conv +{ + +// Implementation of quantized DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct QuantizedDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON +template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(filter_ptr); + filter_s8.val[1] = vld1_s8(filter_ptr + 8); + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[0].val[i] = + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + acc[1].val[i] = + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. 
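      // Input depth 8 with depth multiplier 1: each pixel produces 8 output channels,
      // so the pair of pixels handled here fills four int32x4 accumulators.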
+ int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + acc[0] = vld1q_s32(acc_buffer_ptr); + acc[1] = vld1q_s32(acc_buffer_ptr + 4); + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc[0]); + vst1q_s32(acc_buffer_ptr + 4, acc[1]); + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. 
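      // Each remaining pixel supplies only 4 input bytes, so they are inserted lane by
      // lane (presumably to avoid over-reading with a full 8-byte vld1_s8).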
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4x2_t input_dup2 = vzip_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + int outp = 0; + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. 
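      // 2 input channels x depth multiplier 8 = 16 output channels for this pixel,
      // held in four int32x4 accumulators; the lane index selects which input
      // channel feeds each half of the 16 filter values.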
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input_dup2); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. 
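    // Input depth 2 with depth multiplier 1: the two filter taps are duplicated into
    // lanes 0..3 below so that each int16x4 multiply-accumulate covers two pixels
    // (two channels each).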
+ int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. 
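      // Only two accumulator lanes are live here; the int32x2 pair is widened to an
      // int32x4 for vmlal_s16 and the low half is written back afterwards.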
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x2_t acc = vld1_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); + acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); + acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); + acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vmlal_n_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. 
+ int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i); + const int16x8_t input_s16 = vmovl_s8(input_s8); + input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + } + input_ptr += 16; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // We will have to duplicate bytes in a NEON register, 3-fold. + // We will do that by register-level table-look-up using VTBL instructions. + // Here we prepare the registers containing the table-lookup indices. + static const int8_t dup3_indices_array[3][8] = { + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + int8x8_t dup3_indices[3]; + for (int i = 0; i < 3; i++) + { + dup3_indices[i] = vld1_s8(dup3_indices_array[i]); + } + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + int16x8_t filter[3]; + int8x8x3_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + filter_s8.val[2] = vld1_s8(local_filter_ptr + 16); + local_filter_ptr += 24; + for (int i = 0; i < 3; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, duplicate 3-fold, add input_offset. 
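        // (A sketch of what the vtbl1_s8 lookups below produce: for input bytes
        // {i0,i1,i2,i3,i4,i5,i6,i7}, the three dup3_indices tables select
        //   {i0,i0,i0,i1,i1,i1,i2,i2}, {i2,i3,i3,i3,i4,i4,i4,i5}, {i5,i5,i6,i6,i6,i7,i7,i7},
        // i.e. each input channel value is repeated depth_multiplier == 3 times,
        // matching the scalar expansion
        //   for (int k = 0; k < 8; ++k)
        //     for (int r = 0; r < 3; ++r)
        //       dup3[3 * k + r] = in[k];   // "dup3" and "in" are illustrative names
        // so each duplicated lane lines up with one output channel in the
        // multiply-accumulates that follow.)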
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + + int8x8_t input_s8_dup3[3]; + for (int i = 0; i < 3; i++) + { + input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]); + } + int16x8_t input_dup3[3]; + for (int i = 0; i < 3; i++) + { + const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]); + input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4x3_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); + } + // Multiply-accumulate + for (int j = 0; j < 3; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); + } + acc_buffer_ptr += 24; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 3; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 3; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + int16x8_t filter[2]; + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + local_filter_ptr += 16; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, add input_offset, duplicate 2-fold. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Load the accumulators from acc_buffer. + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Multiply-accumulate. + for (int j = 0; j < 2; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + } + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the inputs. 
+ const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 2; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1); + local_filter_ptr += 16; + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0); + int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1); + local_input_ptr += 16; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(local_filter_ptr); + local_filter_ptr += 8; + const int16x8_t filter = vmovl_s8(filter_s8); + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle one input channel at a time. 
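      // (The variable-input-depth kernels all follow this tiered pattern: as many
      // 16-channel and then 8-channel vector iterations as fit, with the scalar
      // loop below mopping up the remaining input_depth % 8 channels. For example,
      // input_depth == 21 runs one 16-channel iteration, no 8-channel iteration,
      // and 5 scalar iterations.)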
+ for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += input_ptr_increment; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. 
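    // (For the <kAllowStrided, 1, N> kernels input_depth is 1, so each output
    // pixel consumes a single input value that is broadcast against N filter
    // values; a scalar sketch for N == 16, with illustrative indexing:
    //
    //   for (int m = 0; m < 16; ++m)
    //     acc_buffer_ptr[m] += static_cast<int32_t>(filter_ptr[m]) * (*input_ptr + input_offset);
    //
    // vmlal_n_s16 below performs four of these multiply-accumulates per
    // instruction.)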
+ int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); + acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1); + int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2); + int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3); + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + int16x8_t filter_2 = vmovl_s8(filter_s8_2); + int16x8_t filter_3 = vmovl_s8(filter_s8_3); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4); + int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5); + int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6); + int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7); + // Multiply-accumulate + acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input); + acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input); + acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input); + acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input); + acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input); + acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input); + acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input); + acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4); + vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5); + vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6); + vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7); + acc_buffer_ptr += 32; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t 
input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
+    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+    // We load the first 16 bytes into filter_s8_{0,1} as usual.
+    // Then we load the last 8 bytes into filter_s8_x (x for 'extra').
+    // This is redundant: the first 4 bytes of filter_s8_x are the same
+    // as the last 4 bytes of filter_s8_1.
+    int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+    int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+    int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
+    int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+    int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+    int16x8_t filter_x = vmovl_s8(filter_s8_x);
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++)
+    {
+      int8_t input_s8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      acc_buffer_ptr += 20;
+    }
+  }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
+{
+  static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+                  const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
+    const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+    const int16x8_t filter = vmovl_s8(filter_s8);
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++)
+    {
+      int8_t input_s8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++)
+      {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++)
+      {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
+{
+  static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+                  const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
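    // (In this <kAllowStrided, 2, 1> kernel the two filter values are laid out
    // as {f0, f1, f0, f1}, so in the 2-pixel path below one vmlal_s16 updates
    // the two output channels of two consecutive output pixels at once; the two
    // int8 inputs of each pixel are fetched with a single 16-bit load and then
    // widened to int16.)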
+ int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int16x4_t input_s16 = vdup_n_s16(0); + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 0); + input_ptr += input_ptr_increment; + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 1); + input_ptr += input_ptr_increment; + input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + if (num_output_pixels <= 0) + { + return; + } + + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle one output pixel at a time until second to the last pixel. Second + // to the last because we read eight input pixels while only processing + // four. + for (; outp < num_output_pixels - 1; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle the last output pixel. 
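    // (The main loop above uses an 8-byte vld1_s8 even though only 4 input
    // values are consumed per pixel, so it may read up to 4 bytes beyond the
    // current pixel; stopping one pixel early and loading the final pixel
    // lane-by-lane below keeps all reads inside the input buffer.)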
+ // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4); + int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1); + int16x4_t filter_0 = vget_low_s16(filter_s16_0); + int16x4_t filter_1 = vget_high_s16(filter_s16_0); + int16x4_t filter_2 = vget_high_s16(filter_s16_1); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(input_ptr); + int8x8_t input_s8_1 = vld1_s8(input_ptr + 4); + input_ptr += input_ptr_increment; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + + // Multiply-accumulate + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); + acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); + + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + + acc_buffer_ptr += 12; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, int16_t input_offset, + int pad_width, int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32_t *acc_buffer) +{ + // Consistency check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. 
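  // (Reading the template arguments: kAllowStrided selects whether stride != 1
  // is supported, kFixedInputDepth pins input_depth at compile time (0 means
  // "any depth"), and kFixedDepthMultiplier pins depth_multiplier. For example,
  //   QuantizedDepthwiseConvKernel<false, 8, 1>  // stride 1 only, input_depth == 8, multiplier == 1
  //   QuantizedDepthwiseConvKernel<true, 0, 3>   // any stride, any input_depth, multiplier == 3
  // which is what the static_asserts and runtime asserts below enforce.)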
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of DepthwiseConvAccumRow, portable, non-templatized. 
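// (Both the templated accumulator above and the generic fallback below clamp
// the output-x range the same way: the input sample used for output column
// out_x is in_x = out_x * stride - pad_width + dilation_factor * filter_x, and
// requiring 0 <= in_x < input_width gives
//
//   out_x >= ceil((pad_width - dilation_factor * filter_x) / stride)
//   out_x <  ceil((pad_width + input_width - dilation_factor * filter_x) / stride)
//
// which is what the (numerator + stride - 1) / stride expressions compute for
// non-negative numerators, before the range is further clamped to the
// [out_x_buffer_start, out_x_buffer_end) chunk currently being processed.)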
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, + int16_t input_offset, int pad_width, + int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, + int32_t *acc_buffer) +{ + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) + { + const int8_t *filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) + { + const int16_t input_val = *input_ptr++ + input_offset; + for (int m = 0; m < depth_multiplier; m++) + { + const int16_t filter_val = *filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const int32_t *bias_data, int32_t *acc_buffer) +{ + int i = 0; +#ifdef USE_NEON + if (output_depth == 1) + { + const int32x4_t b = vdupq_n_s32(bias_data[0]); + for (; i <= num_output_pixels - 16; i += 16) + { + vst1q_s32(acc_buffer + i + 0, b); + vst1q_s32(acc_buffer + i + 4, b); + vst1q_s32(acc_buffer + i + 8, b); + vst1q_s32(acc_buffer + i + 12, b); + } + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + i, b); + } + } + else if (output_depth == 2) + { + int32x4_t b = vdupq_n_s32(bias_data[0]); + b = vsetq_lane_s32(bias_data[1], b, 1); + b = vsetq_lane_s32(bias_data[1], b, 3); + for (; i <= num_output_pixels - 8; i += 8) + { + vst1q_s32(acc_buffer + 2 * i + 0, b); + vst1q_s32(acc_buffer + 2 * i + 4, b); + vst1q_s32(acc_buffer + 2 * i + 8, b); + vst1q_s32(acc_buffer + 2 * i + 12, b); + } + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 2 * i, b); + } + } + else if (output_depth == 4) + { + const int32x4_t b = vld1q_s32(bias_data); + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + 4 * i + 0, b); + vst1q_s32(acc_buffer + 4 * i + 4, b); + vst1q_s32(acc_buffer + 4 * i + 8, b); + vst1q_s32(acc_buffer + 4 * i + 12, b); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 4 * i, b); + } + } + else if (output_depth == 8) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + vst1q_s32(acc_buffer + 8 * i + 8, b0); + vst1q_s32(acc_buffer + 8 * i + 12, b1); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + } + } + else if (output_depth == 16) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = 
vld1q_s32(bias_data + 4); + const int32x4_t b2 = vld1q_s32(bias_data + 8); + const int32x4_t b3 = vld1q_s32(bias_data + 12); + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 16 * i + 0, b0); + vst1q_s32(acc_buffer + 16 * i + 4, b1); + vst1q_s32(acc_buffer + 16 * i + 8, b2); + vst1q_s32(acc_buffer + 16 * i + 12, b3); + } + } +#endif + for (; i < num_output_pixels; i++) + { + memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth); + } +} + +inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape & /* bias_shape */, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, int thread_start, + int thread_end, int thread_dim) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_rows = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + static const int kAccBufferMaxSize = 2048; + int32_t acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + UNUSED_RELEASE(kAccBufferActualSize); + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric); + row_accum_func_t row_accum_func = nullptr; + +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \ + if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ + (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ + { \ + row_accum_func = \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + // We go over our list of kernels by decreasing order of preference + // for the cases where multiple kernels could apply. + + // Start with the fastest kernels: AllowStrided=false, fixed input depth. 
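  // (Each TFMINI_USE_DEPTHWISECONV_KERNEL(A, D, M) line below expands to a
  // guarded assignment, so the first matching entry in this preference-ordered
  // list wins; e.g. TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) becomes,
  // roughly:
  //
  //   if (!row_accum_func && stride_width == 1 && input_depth == 8 && depth_multiplier == 1)
  //     row_accum_func = QuantizedDepthwiseConvAccumRow<false, 8, 1>;
  //
  // and anything not matched falls through to the generic row accumulator.)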
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1) + + // Next come the strided kernels: AllowStrided=true, fixed input depth. + // They are a bit less efficient, but allow stride!=1. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) +#endif // USE_NEON + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; + } + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_rows; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + assert(thread_start >= 0); + assert(thread_end <= output_rows); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + int8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_rows + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. 
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + input_offset, pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating int32_t values. Now need to convert them to + // the final 8bit form and store them. + const int num_output_values = output_depth * num_output_pixels; + + Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset, + output_activation_min, output_activation_max, acc_buffer, output_ptr); + + output_ptr += num_output_values; + } + } + output_ptr += batch_step; + } +} + +} // namespace depthwise_conv + +template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + int thread_start, int thread_end, int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + +// TODO Use below codes +#if 0 +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +#if defined(__ANDROID__) && defined(__clang__) + CpuFlags cpu_flags; + GetCpuFlags(&cpu_flags); + const bool has_dot_product_instructions = cpu_flags.neon_dotprod; + + // Dispatch to dot-product 3x3 kernels when supported. 
+ if (has_dot_product_instructions) + { + using optimized_ops::depthwise_conv::DotProduct3x3KernelType; + DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, output_shape, params, output_shift); + if (kernel_type != DotProduct3x3KernelType::kNone) + { + DepthwiseConvParams params_copy = params; + params_copy.output_shift_per_channel = output_shift; + params_copy.output_multiplier_per_channel = output_multiplier; + optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel< + DepthwiseConvImplementation::kUseNeon3x3DotProduct>( + params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } + } + +#endif + // Dispatch to non-dot-product 3x3 kernels when supported. + + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + + // Call kernel optimized for depthwise convolutions using 3x3 filters if + // parameters are supported. + if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, stride_width, stride_height, dilation_width_factor, + dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0, + output_shift)) + { + optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel< + DepthwiseConvOutputRounding::kUpward>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } +#endif + +#endif /* end of if 0 */ + + depthwise_conv::DepthwiseConvGeneral( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const int8_t *input_data, const Shape &filter_shape, + const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + int8_t *output_data, int thread_start, int thread_end, int thread_dim) +{ + return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task +{ + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), output_multiplier_(output_multiplier), output_shift_(output_shift), + input_shape_(input_shape), input_data_(input_data), filter_shape_(filter_shape), + filter_data_(filter_data), bias_shape_(bias_shape), bias_data_(bias_data), + output_shape_(output_shape), 
output_data_(output_data), thread_start_(thread_start), + thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + DepthwiseConvImpl(params_, output_multiplier_, output_shift_, input_shape_, input_data_, + filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_, + output_data_, thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const int32_t *output_multiplier_; + const int32_t *output_shift_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape, int thread_dim) +{ + constexpr int kMinMulPerThread = 8; + const int output_units = output_shape.Dims(thread_dim); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_mul_per_unit = + FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width; + const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1; + int thread_count = output_units / min_units_per_thread; + return thread_count; +} + +inline void DepthwiseConvPerChannel(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + ruy::Context *ruy_context) +{ + UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context); + + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batches = output_shape.Dims(0); + const int output_rows = output_shape.Dims(1); + int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0); + int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1); + int thread_dim, thread_count, thread_dim_size; + if (thread_count_batch > thread_count_row) + { + thread_dim = 0; + thread_dim_size = output_batches; + thread_count = thread_count_batch; + } + else + { + thread_dim = 1; + thread_dim_size = output_rows; + thread_count = thread_count_row; + } + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const int max_threads = ruy_context->max_num_threads(); + thread_count = std::max(1, std::min(thread_count, max_threads)); + + if (thread_count == 1) + { + DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, + /*thread_start=*/0, + /*thread_end=*/output_rows, /*thread_dim=*/1); + } + else + { + std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). 
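    // (The loop below hands each task a chunk of thread_dim_size as evenly as
    // possible, via thread_end = thread_start + remaining / tasks_left. For
    // example, thread_dim_size == 7 with thread_count == 3 yields the ranges
    // [0, 2), [2, 4) and [4, 7).)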
+ tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) + { + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; + } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); + } +} + +} // namespace optimized_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h index 93cb21e0b..96e1d9127 100644 --- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -62,7 +62,7 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap } template <typename T> -inline void BroadcastBinaryArithmeticOpSlowQuant8( +inline typename std::enable_if_t<is_quant8<T>::value> BroadcastBinaryArithmeticOpSlow( const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) @@ -72,11 +72,6 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); - if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255)) - { - throw std::runtime_error{"Support only for Quant8."}; - } - // Comment from tensorflow lite: // // In Tensorflow, the dimensions are canonically named (batch_number, row, @@ -98,11 +93,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax<uint8_t>( - fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( + fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h index 43a5bf256..4474754af 100644 --- a/compute/cker/include/cker/operation/reference/Conv.h +++ b/compute/cker/include/cker/operation/reference/Conv.h @@ -190,6 +190,116 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 } } +inline void Conv(const ConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, int8_t *output_data) +{ + UNUSED_RELEASE(bias_shape); + // Get parameters. 
+ const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + assert(output_activation_min < output_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(jianlijianli): Add a check to make sure the + // accumulator depth is smaller than 2^16. 
+ acc += filter_val * (input_val + input_offset); + } + } + } + + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[out_channel], + output_shift[out_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = + static_cast<int8_t>(acc); + } + } + } + } +} + } // namespace reference } // namespace cker } // namespace nnfw |
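The reference kernel above, like the depthwise paths earlier in this diff, finishes each int32 accumulator with MultiplyByQuantizedMultiplier(acc, output_multiplier[c], output_shift[c]). That helper is not part of this hunk; the sketch below shows the usual TFLite-style fixed-point requantization that cker's helper is assumed to mirror (a Q0.31 multiplier plus a power-of-two shift, typically non-positive for convolution outputs, with round-to-nearest behaviour). The function name is chosen here for illustration only.

#include <cstdint>

// Illustrative sketch only: assumes the cker helper follows the TFLite definition
// (saturating rounding-doubling high multiply by a Q0.31 multiplier, then a
// rounding arithmetic right shift by -shift when shift <= 0). The saturation of
// the INT32_MIN * INT32_MIN edge case is omitted for brevity.
inline int32_t MultiplyByQuantizedMultiplierSketch(int32_t acc, int32_t quantized_multiplier,
                                                   int shift)
{
  const int left_shift = shift > 0 ? shift : 0;   // usually 0 for conv outputs
  const int right_shift = shift > 0 ? 0 : -shift;

  // Rounding-doubling high multiply: keep the upper 32 bits of the 64-bit
  // product of (acc << left_shift) and the Q0.31 multiplier, with a rounding nudge.
  const int64_t prod = static_cast<int64_t>(acc * (1 << left_shift)) * quantized_multiplier;
  const int64_t nudge = prod >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  int32_t result = static_cast<int32_t>((prod + nudge) / (1ll << 31));

  // Rounding arithmetic right shift (round half away from zero).
  if (right_shift > 0)
  {
    const int32_t mask = (1 << right_shift) - 1;
    const int32_t remainder = result & mask;
    const int32_t threshold = (mask >> 1) + (result < 0 ? 1 : 0);
    result = (result >> right_shift) + (remainder > threshold ? 1 : 0);
  }
  return result;
}

// The surrounding per-channel epilogue then matches the code in the hunk above:
//   acc = MultiplyByQuantizedMultiplierSketch(acc, output_multiplier[c], output_shift[c]);
//   acc += output_offset;
//   acc = std::max(acc, output_activation_min);
//   acc = std::min(acc, output_activation_max);
//   output[i] = static_cast<int8_t>(acc);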