Imported Upstream version 1.4.0 upstream/1.4.0 submit/tizen/20200423.054851

author: Chunseok Lee <chunseok.lee@samsung.com> 2020-04-23 14:45:49 +0900
committer: Chunseok Lee <chunseok.lee@samsung.com> 2020-04-23 14:45:49 +0900
commit: e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e (patch)
tree: 44a1a7951d168dd4370e13593ed03f4bc6d920c5 /compute/ARMComputeEx/src
parent: 302e6564a7a76109e1178207e44e45a58631c477 (diff)
download: nnfw-e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e.tar.gz
nnfw-e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e.tar.bz2
nnfw-e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e.zip
107 files changed, 5647 insertions, 278 deletions
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 7d4760600..191a5bc2a 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
 
@@ -53,13 +69,16 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"gather_ex", "gather_ex.cl"},
     {"gather_ex_1d", "gather_ex.cl"},
     {"gather_ex_1d_out", "gather_ex.cl"},
+    {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
     {"hashtable_lookup", "hashtable_lookup.cl"},
     {"instance_normalization_ex", "instance_normalization_ex.cl"},
+    {"multiply_scale_factor", "multiply_scale_factor.cl"},
     {"neg_tensor", "neg_tensor.cl"},
     {"permute_generic", "permute_ex.cl"},
     {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
     {"prelu", "prelu.cl"},
     {"prelu_qasymm8", "prelu_quantized.cl"},
+    {"quantization_symm8", "quantization_symm8.cl"},
     {"reduce_min_max", "reduce_operation.cl"},
     {"reduce_sum_mean", "reduce_operation.cl"},
     {"topkv2_init", "topkv2.cl"},
@@ -71,6 +90,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
     {"radixsort_reorder", "topkv2_radixsort.cl"},
     {"topkv2_quicksort", "topkv2_quicksort.cl"},
+    {"scale_factor_symm8", "scale_factor.cl"},
     {"space_to_batch_4d_nchw", "space_to_batch.cl"},
     {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
     {"space_to_depth_nchw", "space_to_depth.cl"},
@@ -100,6 +120,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/gather_ex.clembed"
     },
     {
+        "gemmlowp_ex.cl",
+#include "./cl_kernels/gemmlowp_ex.clembed"
+    },
+    {
         "hashtable_lookup.cl",
 #include "./cl_kernels/hashtable_lookup.clembed"
     },
@@ -120,6 +144,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/binary_logical_op.clembed"
     },
     {
+        "multiply_scale_factor.cl",
+#include "./cl_kernels/multiply_scale_factor.clembed"
+    },
+    {
         "neg_tensor.cl",
 #include "./cl_kernels/neg_tensor.clembed"
     },
@@ -132,10 +160,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/prelu_quantized.clembed"
     },
     {
+        "quantization_symm8.cl",
+#include "./cl_kernels/quantization_symm8.clembed"
+    },
+    {
         "reduce_operation.cl",
 #include "./cl_kernels/reduce_operation.clembed"
     },
     {
+        "scale_factor.cl",
+#include "./cl_kernels/scale_factor.clembed"
+    },
+    {
         "space_to_batch.cl",
 #include "./cl_kernels/space_to_batch.clembed"
     },
@@ -180,7 +216,7 @@ Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
 
   if (_kernel_program_map.end() == kernel_program_it)
   {
-    ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+    ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
   }
   std::string concat_str;
 
@@ -261,7 +297,7 @@ const Program &CLKernelLibraryEx::load_program(const std::string &program_name)
 
   if (_program_source_map.end() == program_source_it)
   {
-    ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+    ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
   }
 
   program = Program(_context, program_name, program_source_it->second);
@@ -282,7 +318,7 @@ const Program &CLKernelLibraryEx::load_program(const std::string &program_name)
   }
   else
   {
-    ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
+    ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
   }
 #endif /* EMBEDDED_KERNELS */
 
@@ -315,7 +351,7 @@ std::string CLKernelLibraryEx::get_program_source(const std::string &program_nam
 
   if (program_source_it == _program_source_map.end())
   {
-    ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+    ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
   }
 
   return program_source_it->second;
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
index 2a6dfc91f..03717cfe9 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
index 77e239f55..f74c1c103 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers_asymm.h"
 
 #ifdef SATURATE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
index 8c875516d..e249663bc 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #ifndef VEC_SIZE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
index 2342fda9f..4147a0017 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #ifndef SCALE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
index e005322f7..0285c955b 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
index dd8cb6d93..92e5dfbee 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #ifndef VEC_SIZE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
index 09f776156..2236021f1 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
new file mode 100644
index 000000000..80ba73d1d
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \
+    defined(COLS_A)
+#define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B
+ * (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type:
+ * QASYMM8
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src0_step_x                        src0_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type:
+ * same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data
+ * type: S32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination
+ * matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements for
+ * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+                                     IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z,
+                                     uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                     ,
+                                     uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                     ,
+                                     uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                     )
+{
+  int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+  // Compute starting address for matrix A and Matrix B
+  int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+  // Update address for the matrix A
+  src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+  // Update address for the matrix B
+  src_addr.s1 += idx;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+  // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+  // the z dimension
+  // in order to take into account the presence of possible cross plane paddings
+  //
+  //  |                  |
+  //  |      plane0      |
+  //  |                  |
+  //  |__________________|
+  //  |******************|
+  //  |  cross_plane_pad |
+  //  |******************|
+  //  |                  |
+  //  |      plane1      |
+  //  |                  |
+  //  |__________________|
+
+  // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+  // by HEIGHT_GEMM3D
+  uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+              (uint4)HEIGHT_GEMM3D;
+  zin = min(DEPTH_GEMM3D - 1, zin);
+
+  // Add offset due to the cross plane paddings
+  zin *= (src_cross_plane_pad * src0_stride_y);
+
+  // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+  // multiply src0_stride_z by DEPTH_GEMM3D
+  src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+  // Add offset for batched GEMM
+  src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+  // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+  src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+  src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+  int end_row_vec_a = src_addr.s0 + COLS_A;
+
+  VECTOR_INT acc0 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+  VECTOR_INT acc1 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+  VECTOR_INT acc2 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+  VECTOR_INT acc3 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+  VECTOR_INT acc4 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+  for (; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
+  {
+    // Load values from matrix A
+    char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    char2 a4 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    // Load values from matrix B
+    VECTOR_CHAR b0 =
+        VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
+    VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(
+        0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+    // Accumulate
+    acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0;
+    acc0 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1.s0;
+    acc1 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2.s0;
+    acc2 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3.s0;
+    acc3 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4.s0;
+    acc4 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a4.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+  }
+
+  for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+  {
+    // Load values from matrix A
+    char a0 = *(__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    char a1 = *(__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    char a2 = *(__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    char a3 = *(__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    char a4 = *(__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    // Load values from matrix B
+    VECTOR_CHAR b0 =
+        VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1));
+
+    // Accumulate
+    acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+  }
+
+  const int z = get_global_id(2);
+
+  // Compute destination address
+  Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+  // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+  // the z dimension
+  // in order to take into account the presence of possible cross plane paddings
+  //
+  //  |                  |
+  //  |      plane0      |
+  //  |                  |
+  //  |__________________|
+  //  |******************|
+  //  |  cross_plane_pad |
+  //  |******************|
+  //  |                  |
+  //  |      plane1      |
+  //  |                  |
+  //  |__________________|
+
+  // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+  // by HEIGHT_GEMM3D
+  uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) +
+                (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+               (uint8)HEIGHT_GEMM3D;
+  zout = min(DEPTH_GEMM3D - 1, zout);
+
+  // Add offset due to the cross plane paddings
+  zout *= (dst_cross_plane_pad * dst_stride_y);
+
+  // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+  // multiply dst_stride_z by DEPTH_GEMM3D
+  dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
+  // Store the result
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+  // Add offset for batched GEMM
+  dst.ptr += z * dst_stride_z;
+
+  // Store the result
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+  VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+  (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
+#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) &&
+       // defined(COLS_A)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
index 73f29e3e5..a4f7dbd48 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #ifndef VEC_SIZE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 0e123ae0a..2d0b6a299 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -1,4 +1,20 @@
 /*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -21,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #ifndef ARM_COMPUTE_HELPER_H
 #define ARM_COMPUTE_HELPER_H
 
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
index c39138caa..a83b1a8a5 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -1,4 +1,20 @@
 /*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -21,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #ifndef ARM_COMPUTE_HELPERS_ASYMM_H
 #define ARM_COMPUTE_HELPERS_ASYMM_H
 
@@ -403,4 +420,4 @@ ASYMM_RESCALE_IMPL(4)
 ASYMM_RESCALE_IMPL(8)
 ASYMM_RESCALE_IMPL(16)
 
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
-\ No newline at end of file
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
index 1d96150f8..014842680 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "helpers.h"
 
 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
new file mode 100644
index 000000000..3943fc4c2
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
+/** Multiplies every input element by the scale factor of its row and by a constant multiplier.
+ *
+ * @note Data type of scale and output is given with -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note The vectorized path is compiled only when LAST_ACCESSED_X is also defined.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Elements are
+ * read as 32-bit int
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ * tensor
+ * @param[in]  scale_ptr                            Pointer to the scale tensor. Elements are
+ * read as DATA_TYPE, one value per get_global_id(1)
+ * @param[in]  scale_stride_x                       Stride of the scale tensor in X dimension (in
+ * bytes)
+ * @param[in]  scale_step_x                         scale_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  scale_offset_first_element_in_bytes  The offset of the first element in the scale
+ * tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Elements
+ * are written as DATA_TYPE
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in]  multiplier                           Constant factor applied to every scale value
+ */
+__kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale),
+                                    IMAGE_DECLARATION(output), float multiplier)
+{
+  // Get the input and output element pointers via the helpers.h CONVERT_TO_IMAGE_STRUCT macro
+  Image input = CONVERT_TO_IMAGE_STRUCT(input);
+  Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+  // If the vector starting at xi would extend past LAST_ACCESSED_X, shift both pointers back
+  // by the overshoot so the vector access stays within bounds
+  const int xi = (int)(get_global_id(0) * VEC_SIZE);
+  input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+  output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+  // Load VEC_SIZE int values and convert them to DATA_TYPE
+  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+  val = CONVERT(VLOAD(VEC_SIZE)(0, (__global int *)input.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+  // Broadcast the scale value indexed by get_global_id(1) to every lane
+  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+  vscale = *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1));
+
+  // Dequantize: val * scale * multiplier
+  vscale *= (DATA_TYPE)(multiplier);
+  val *= vscale;
+
+  // Store result
+  VSTORE(VEC_SIZE)
+  (val, 0, (__global DATA_TYPE *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X): scalar fallback handling a single element
+  *((__global DATA_TYPE *)(output.ptr)) =
+      ((DATA_TYPE)(*((__global int *)(input.ptr)))) *
+      *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
index 4aa7883c3..15c16f80c 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #ifndef VEC_SIZE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
index 2074d3ceb..76fda9041 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers_asymm.h"
 
 #ifdef SATURATE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
index 62a8901f6..12c8eeb79 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #ifndef VEC_SIZE
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
index 5e0abd585..a66e107d1 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 #define SUB(x, y) (x) - (y)
 
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
new file mode 100644
index 000000000..4ae9adb0b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) // scalar convert_<type>_rte(x)
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x))) // vector form
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size) // expand args, then ##
+#define MIN_QUANT_VAL -127 // lower clamp bound used by quantization_symm8
+#define MAX_QUANT_VAL 127 // upper clamp bound used by quantization_symm8
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
+
+/** Performs symmetric quantization of the input to the range [-127, 127], one scale per row.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g.
+ * -DDATA_TYPE_IN=float
+ * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type.
+ * e.g. -DDATA_TYPE_OUT=char
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note The scale is read from the scale tensor, one value per get_global_id(1); the kernel does
+ * not reference SCALE or OFFSET.
+ * @note Results are clamped to [MIN_QUANT_VAL, MAX_QUANT_VAL], which this file defines as
+ * [-127, 127].
+ * @note The vectorized path is compiled only when LAST_ACCESSED_X is also defined; otherwise a
+ * scalar path handling a single element is used.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Elements are
+ * read as DATA_TYPE_IN
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
+ * tensor
+ * @param[in]  scale_ptr                            Pointer to the scale tensor. Elements are
+ * read as DATA_TYPE_IN
+ * @param[in]  scale_stride_x                       Stride of the scale tensor in X dimension (in
+ * bytes)
+ * @param[in]  scale_step_x                         scale_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  scale_offset_first_element_in_bytes  The offset of the first element in the scale
+ * tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Elements
+ * are written as DATA_TYPE_OUT
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale),
+                                 IMAGE_DECLARATION(output))
+{
+  // Get the input and output element pointers via the helpers.h CONVERT_TO_IMAGE_STRUCT macro
+  Image input = CONVERT_TO_IMAGE_STRUCT(input);
+  Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+  // If the vector starting at xi would extend past LAST_ACCESSED_X, shift both pointers back
+  // by the overshoot so the vector access stays within bounds
+  const int xi = (int)(get_global_id(0) * VEC_SIZE);
+  input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+  output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+  // Load VEC_SIZE input values
+  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+  val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+  // Broadcast the scale value indexed by get_global_id(1) to every lane
+  const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale =
+      *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));
+
+  // Quantize: divide by the scale, convert to int with _rte rounding, clamp to the quantized range
+  VEC_DATA_TYPE(int, VEC_SIZE)
+  res = CLAMP(CONVERT_RTE_VEC(val / vscale, int, VEC_SIZE), MIN_QUANT_VAL, MAX_QUANT_VAL);
+
+  // Convert to DATA_TYPE_OUT and store the result
+  VSTORE(VEC_SIZE)
+  (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X): scalar fallback handling a single element
+  *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(
+      CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) /
+                      (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))),
+                  int),
+      MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
index d7ea2e2c4..832ac1270 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl
new file mode 100644
index 000000000..3d5e90356
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(WIDTH)
+/** Computes the symmetric 8-bit scale factor max(|min|, |max|) / 127 of WIDTH consecutive floats.
+ *
+ * @note The number of elements scanned by each work-item must be provided at compile time using
+ * -DWIDTH (e.g. -DWIDTH=320). The kernel body addresses dst through dst_ptr only.
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types:
+ * F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                          Pointer to the scale factor vector. The result is
+ * written at float index get_global_id(1). Supported data types: F32.
+ * @param[in] dst_stride_x                      Stride of the scale factor vector in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the scale
+ * factor vector
+ */
+__kernel void scale_factor_symm8(IMAGE_DECLARATION(src), VECTOR_DECLARATION(dst))
+{
+  Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+  float4 min_value = (float4)FLT_MAX;
+  float4 max_value = (float4)-FLT_MAX;
+
+  int x = 0;
+  __global float *src_addr = (__global float *)(src.ptr);
+
+  for (; x <= (int)(WIDTH - 8); x += 8) // vector loop: 8 floats per iteration
+  {
+    float8 value = vload8(0, (__global float *)(src_addr + x));
+
+    min_value = select(value.s0123, min_value, min_value < value.s0123);
+    min_value = select(value.s4567, min_value, min_value < value.s4567);
+
+    max_value = select(value.s0123, max_value, max_value > value.s0123);
+    max_value = select(value.s4567, max_value, max_value > value.s4567);
+  }
+
+  for (; x < WIDTH; ++x) // scalar loop: elements not covered by the vector loop
+  {
+    float value = *(src_addr + x);
+
+    min_value.s0 = min(min_value.s0, value);
+    max_value.s0 = max(max_value.s0, value);
+  }
+
+  // Reduce the four lanes so that lane s0 holds the overall min and max
+  min_value.s01 = min(min_value.s01, min_value.s23);
+  min_value.s0 = min(min_value.s0, min_value.s1);
+  max_value.s01 = max(max_value.s01, max_value.s23);
+  max_value.s0 = max(max_value.s0, max_value.s1);
+
+  // Extract scale: the largest absolute value divided by 127
+  max_value.s0 = max(fabs(min_value.s0), fabs(max_value.s0)) / 127.0f;
+
+  // Store the scale factor at float index get_global_id(1) of dst
+  *((__global float *)(dst_ptr) + get_global_id(1)) = max_value.s0;
+}
+#endif // defined(WIDTH)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
index 7367da7fb..b1611043b 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
index a26e762e8..eb612f834 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016, 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
index 50472e4f9..3eb1a4ce7 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +14,30 @@
  * limitations under the License.
  */
 
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 __kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
index 9594daf19..460de790b 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +14,30 @@
  * limitations under the License.
  */
 
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "helpers.h"
 
 __global inline float *get_vec_elem(Vector *vec, int idx)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
index f6830d229..e9d4696b4 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +14,30 @@
  * limitations under the License.
  */
 
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 // reference:
 // https://code.google.com/archive/p/ocl-radix-sort/source/default/source
 // OpenCL kernel sources for the CLRadixSort class
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
index 7f4b5b0df..06eeb5b98 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index c14e73634..bb5556888 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
index 35f607bd0..01ea655b4 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLCastKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
@@ -52,8 +76,9 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataT
   // Create kernel
   if (is_data_type_quantized_asymmetric(input->info()->data_type()))
   {
-    const float scale_in = input->info()->quantization_info().scale;
-    const int offset_in = input->info()->quantization_info().offset;
+    UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
+    const float scale_in = qinfo.scale;
+    const int offset_in = qinfo.offset;
     build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
     build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
 
@@ -62,8 +87,10 @@ void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataT
   }
   else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
   {
-    const float scale_in = output->info()->quantization_info().scale;
-    const int offset_in = output->info()->quantization_info().offset;
+    UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
+    const float scale_in = qinfo.scale;
+    const float offset_in = qinfo.offset;
+
     build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
     build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
 
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
index 2a3433c2b..389136817 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index 0862b78bf..79f5ce065 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
new file mode 100644
index 000000000..235e8975d
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+using ElementsProcessed = Steps;
+// Validates data types, ranks and GEMM sizes (from gemm_info) of input0, input1 and output.
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
+                          const ITensorInfo *output, const GEMMReshapeInfo &gemm_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4,
+                                  "The number of dimensions for the matrix A must be <= 4");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3,
+                                  "The number of dimensions for the matrix B must be <= 3");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 &&
+                                      gemm_info.reinterpret_input_as_3d(),
+                                  "The input1 tensor cannot have more than 2 dimensions if input0 "
+                                  "has to be reinterpreted as 3D");
+
+  const int m = gemm_info.m();
+  const int n = gemm_info.n();
+  const int k = gemm_info.k();
+
+  ARM_COMPUTE_UNUSED(m);
+  ARM_COMPUTE_UNUSED(n);
+  ARM_COMPUTE_UNUSED(k);
+  // input0 must be k along dimension 0; input1 must be n along dimension 0 and k along dimension 1.
+  ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
+  ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
+  ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
+  if (gemm_info.reinterpret_input_as_3d())
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) !=
+                                static_cast<unsigned int>(m));
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
+  }
+  // Shape and data type of the output are only checked once it has a non-zero total size.
+  if (output->total_size() != 0)
+  {
+    const TensorInfo tensor_info_output =
+        output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+  }
+
+  return Status{};
+}
+// Initialises an empty output, sets the X/Y steps in num_elements_processed and builds the window.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1,
+                                                        ITensorInfo *output,
+                                                        const GEMMReshapeInfo &gemm_info,
+                                                        ElementsProcessed &num_elements_processed)
+{
+  unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+  unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+  bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+  Window win{};
+  Window win_out{};
+  bool window_changed = false;
+
+  // In case both input and output have to be reinterpreted as 3D tensors,
+  // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+  if (reinterpret_input_as_3d == reinterpret_output_as_3d)
+  {
+    reinterpret_input_as_3d = false;
+    reinterpret_output_as_3d = false;
+  }
+
+  // Output tensor auto initialization if not yet initialized (GEMM result shape, S32 data type)
+  auto_init_if_empty(*output,
+                     input0->clone()
+                         ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info))
+                         .set_data_type(DataType::S32));
+
+  TensorInfo tmp_info(*output);
+
+  if (reinterpret_output_as_3d)
+  {
+    // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D
+    // GEMM,
+    // the window needs to be constructed on the 2D collapsed version of the tensor
+    TensorShape tmp_shape(output->tensor_shape());
+    tmp_shape.collapse(2U, 1U);
+    tmp_info.set_tensor_shape(tmp_shape);
+  }
+
+  // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor: the Y step is min(output->dimension(1), 4)
+  // Note: if the dot product instruction is available, the 8x2 tile has to be used
+  num_elems_processed_per_iteration_x = 4;
+  num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+
+  // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
+  // The only way to set the paddings properly is to set them explicitly through the
+  // AccessWindowStatic
+  const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2]
+                                        : input0->tensor_shape()[1];
+  const int bottom_pad =
+      (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) %
+      num_elems_processed_per_iteration_y;
+
+  // Configure window
+  win = calculate_max_window(
+      tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+  win_out = calculate_max_window(
+      *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+  AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0),
+                                   input0->dimension(1) + bottom_pad);
+  AccessWindowStatic input1_access(
+      input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+      input1->dimension(1));
+  AccessWindowStatic output_access(
+      output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+      output->dimension(1) + bottom_pad);
+
+  window_changed =
+      update_window_and_padding(win, input0_access,
+                                input1_access) || // window used by the execute_window_loop
+      update_window_and_padding(
+          win_out,
+          output_access); // window used to update the padding requirements of output tensor
+  // The valid region passed below spans the full output tensor shape.
+  Coordinates coord;
+  coord.set_num_dimensions(output->num_dimensions());
+  output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
+
+  // Collapse along the Z direction
+  // This collapse needs to be here in order to tune the Z dimension of LWS
+  Window collapsed = win;
+  const unsigned int dimension_to_collapse =
+      std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+  collapsed = win.collapse(win, dimension_to_collapse);
+  // update_window_and_padding() reporting a change is returned as an "Insufficient Padding!" error.
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, collapsed);
+}
+} // namespace
+// Leaves the kernel unconfigured: tensor pointers are null, _slide_matrix_b defaults to true.
+CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true),
+      _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
+{
+}
+// Validates and stores the tensors, configures the window and builds gemmlowp_mm_midgard_ex.
+void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1,
+                                                 ICLTensor *output,
+                                                 const GEMMReshapeInfo &gemm_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input0->info(), input1->info(), output->info(), gemm_info));
+  // Remember the tensors and the 3D reinterpretation flags; run() reads them later.
+  _input0 = input0;
+  _input1 = input1;
+  _output = output;
+  _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+  _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+
+  // In case both input and output have to be reinterpreted as 3D tensors,
+  // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+  if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+  {
+    _reinterpret_input_as_3d = false;
+    _reinterpret_output_as_3d = false;
+  }
+
+  // Check if we need to slide the matrix B
+  const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d
+                                                 ? _input0->info()->num_dimensions() - 1
+                                                 : _input0->info()->num_dimensions();
+  _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
+
+  ElementsProcessed num_elements_processed{};
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(),
+                                                  gemm_info, num_elements_processed);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Create build options
+  std::string kernel_name(" ");
+  CLBuildOptions build_opts;
+  build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+  build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+  build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                           "-DHEIGHT_GEMM3D=" +
+                               support::cpp11::to_string(output->info()->dimension(1)));
+  build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                           "-DDEPTH_GEMM3D=" +
+                               support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.add_option_if(!_slide_matrix_b,
+                           "-DMATRIX_B_DEPTH=" +
+                               support::cpp11::to_string(input1->info()->dimension(2)));
+  build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+  build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" +
+                        support::cpp11::to_string(num_elements_processed.x()));
+  build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" +
+                        support::cpp11::to_string(num_elements_processed.y()));
+  // The placeholder name above is replaced with the kernel that is actually built.
+  kernel_name = "gemmlowp_mm_midgard_ex";
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+
+  // Set config_id for enabling LWS tuning
+  _config_id = kernel_name;
+  _config_id += "_";
+  _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+  _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+  _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(output->info()->dimension(1));
+  _config_id += "_";
+  _config_id += support::cpp11::to_string(output->info()->dimension(0));
+}
+// Runs validate_arguments(), then the window configuration on clones of the tensor infos.
+Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0,
+                                                  const ITensorInfo *input1,
+                                                  const ITensorInfo *output,
+                                                  const GEMMReshapeInfo &gemm_info)
+{
+  ElementsProcessed num_elements_processed{};
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_and_configure_window(input0->clone().get(), input1->clone().get(),
+                                    output->clone().get(), gemm_info, num_elements_processed)
+          .first);
+
+  return Status{};
+}
+// Enqueues the kernel for every 3D slice of window after binding tensors, strides and paddings.
+void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  if (_input1->info()->num_dimensions() < 3)
+  {
+    // The stride_z for matrix B must be zero if we do not slice (NOTE(review): index 3 is read)
+    ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+  }
+
+  Window slice = window.first_slice_window_3D();
+  Window slice_matrix_b = slice;
+
+  slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+  slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+  if (_reinterpret_input_as_3d)
+  {
+    // Pass input0's total padding (top + bottom) as the argument after the tensors and strides
+    const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
+    const unsigned int total_cross_plane_pad =
+        _input0->info()->padding().top + _input0->info()->padding().bottom;
+    _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+  }
+
+  if (_reinterpret_output_as_3d)
+  {
+    // Pass output's total padding (top + bottom); its slot follows the input one when both are set
+    const unsigned int idx0 =
+        3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+    const unsigned int total_cross_plane_pad =
+        _output->info()->padding().top + _output->info()->padding().bottom;
+    _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+  }
+
+  do
+  {
+    Window slice_b = slice;
+    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A
+    // more than 2
+    // This scenario can happen when the matrix multiplication is used to perform a convolution
+    // operation
+    if (!_slide_matrix_b)
+    {
+      slice_b = slice_matrix_b;
+    }
+
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _input0, slice);
+    add_2D_tensor_argument(idx, _input1, slice_b);
+    add_2D_tensor_argument(idx, _output, slice);
+    _kernel.setArg<cl_uint>(idx++,
+                            static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+    _kernel.setArg<cl_uint>(idx++,
+                            static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+    _kernel.setArg<cl_uint>(idx++,
+                            static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+    enqueue(queue, *this, slice, lws_hint());
+  } while (window.slide_window_slice_3D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 718f615f9..3a25987d0 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index 31e98c9a8..7fbdcdaa7 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index 5db414f62..b45f6bb24 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
new file mode 100644
index 000000000..d305896ea
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Validates the tensor infos used by CLMultiplyScaleFactorKernel.
+// input: S32 with at most 2 dimensions. output: F16/F32, already initialised, same shape as input.
+// scale_factor: F16/F32, non-empty, 1-D, with one entry per index of input's dimension 1.
+// Returns an empty Status when every check passes, otherwise the first failing check.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+                          const ITensorInfo *output)
+{
+  // scale_factor is dereferenced below, so it is null-checked together with input and output.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, scale_factor, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+  // An empty output was rejected above, so one unconditional shape comparison is sufficient.
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+  return Status{};
+}
+// Computes the max window over input and auto-initialises an empty output (F32, input's shape).
+std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+                                                         ITensorInfo *output)
+{
+  // Configure kernel window
+  Window win = calculate_max_window(*input, Steps());
+
+  // Output tensor auto initialization if not yet initialized
+  auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
+
+  // CLMultiplyScaleFactorKernel doesn't need padding so update_window_and_padding() can be
+  // skipped
+  Coordinates coord;
+  coord.set_num_dimensions(output->num_dimensions());
+  output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+  // No check in this helper can fail, so the returned Status is always empty.
+  return std::make_tuple(Status{}, win);
+}
+} // namespace
+// Leaves the kernel unconfigured: tensor pointers are null and the multiplier is 1.
+CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel()
+    : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+// Validates and stores the tensors, sets up the X window and builds multiply_scale_factor.
+void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor,
+                                            ICLTensor *output, float multiplier)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), scale_factor->info(), output->info()));
+  // All three tensors are dereferenced above, hence scale_factor is part of the null check.
+  _input = input;
+  _scale_factor = scale_factor;
+  _output = output;
+  _multiplier = multiplier;
+  // vec_size_x: output elements per 16 bytes; multi_access_x: at least one full vector fits in X.
+  const int vec_size_x = 16 / output->info()->element_size();
+  const int output_width_x = output->info()->tensor_shape().x();
+  const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+  // Create and update the window (if needed)
+  Window win = calculate_max_window(*output->info());
+  if (multi_access_x)
+  {
+    win.set(Window::DimX,
+            Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
+                              vec_size_x));
+  }
+  ICLKernel::configure_internal(win);
+
+  // Create kernel
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+  build_opts.add_option_if(
+      multi_access_x, "-DLAST_ACCESSED_X=" +
+                          support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options()));
+}
+// Runs validate_arguments(), then the window helper on clones of the input and output infos.
+Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+                                             const ITensorInfo *scale_factor,
+                                             const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+  return Status{};
+}
+// Enqueues the kernel per 2D slice with input, scale_factor, output and the multiplier bound.
+void CLMultiplyScaleFactorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = window_collapsed.first_slice_window_2D();
+
+  // Set scale_factor window
+  Window win_scale = calculate_max_window(*_scale_factor->info(), Steps());
+  // The same win_scale is bound for every slice; only the input/output slice advances.
+  do
+  {
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _input, slice);
+    add_1D_tensor_argument(idx, _scale_factor, win_scale);
+    add_2D_tensor_argument(idx, _output, slice);
+    _kernel.setArg<float>(idx++, _multiplier);
+    enqueue(queue, *this, slice, lws_hint());
+  } while (window_collapsed.slide_window_slice_2D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index ecfe05a51..74f7b4158 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLNegKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
index e7d587029..8910a7b80 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
@@ -72,18 +96,18 @@ void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, IC
 
   if (is_data_type_quantized_asymmetric(input->info()->data_type()))
   {
-    build_opts.emplace("-DOFF_IN=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_ALPHA=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().offset));
-    build_opts.emplace("-DOFF_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().offset));
-    build_opts.emplace("-DSCALE_IN=" +
-                       support::cpp11::to_string(input->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_ALPHA=" +
-                       support::cpp11::to_string(alpha->info()->quantization_info().scale));
-    build_opts.emplace("-DSCALE_OUT=" +
-                       support::cpp11::to_string(output->info()->quantization_info().scale));
+    build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string(
+                                         input->info()->quantization_info().uniform().offset));
+    build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string(
+                                            alpha->info()->quantization_info().uniform().offset));
+    build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string(
+                                          output->info()->quantization_info().uniform().offset));
+    build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string(
+                                           input->info()->quantization_info().uniform().scale));
+    build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string(
+                                              alpha->info()->quantization_info().uniform().scale));
+    build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string(
+                                            output->info()->quantization_info().uniform().scale));
     kernel_name += "_qasymm8";
   }
   _kernel =
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
new file mode 100644
index 000000000..2d551f654
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+// Checks the tensor infos of the symmetric quantization kernel; returns the first failing check.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+                          const ITensorInfo *output)
+{
+  // Input: single-channel F32 or F16 with at most 2 dimensions
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+  // Scale factor: same data type as input, non-empty, 1D, with input->dimension(1) elements
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, scale_factor);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+  // Output must always be initialized: single-channel S8 with the same shape as the input
+  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+  return Status{};
+}
+
+// Builds the execution window over the input and marks the whole output shape as valid.
+// The returned Status is always Status{}.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // Number of elements covered by 16 bytes (element_size() is presumably in bytes)
+  const int elems_per_vec = 16 / input->element_size();
+  const int width = input->tensor_shape().x();
+
+  Window window = calculate_max_window(*input, Steps());
+  // Step X by whole vectors only when at least one full vector fits in the input width
+  if (width / elems_per_vec > 0)
+  {
+    const int padded_end = ceil_to_multiple(window.x().end(), elems_per_vec);
+    window.set(Window::DimX, Window::Dimension(window.x().start(), padded_end, elems_per_vec));
+  }
+
+  // Valid region: default-constructed coordinates spanning the full output shape
+  Coordinates origin;
+  origin.set_num_dimensions(output->num_dimensions());
+  output->set_valid_region(ValidRegion(origin, output->tensor_shape()));
+  return std::make_pair(Status{}, window);
+}
+} // namespace
+
+// All tensor pointers start as nullptr; configure() assigns them.
+CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel()
+    : _input{nullptr}, _scale_factor{nullptr}, _output{nullptr}
+{}
+
+// Validates the tensors, stores them, configures the execution window and creates the
+// "quantization_symm8" kernel with the matching build options.
+void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor,
+                                              ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+  _input = input;
+  _scale_factor = scale_factor;
+  _output = output;
+
+  // Same vectorization decision as in validate_and_configure_window()
+  const int elems_per_vec = 16 / input->info()->element_size();
+  const int width = input->info()->tensor_shape().x();
+  const bool vectorized = (width / elems_per_vec > 0);
+
+  auto window_result = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(window_result.first);
+  ICLKernel::configure_internal(window_result.second);
+
+  // LAST_ACCESSED_X = max(width - vector size, 0); only passed when the vectorized path is used
+  const auto last_x = support::cpp11::to_string(std::max<int>(width - elems_per_vec, 0));
+  CLBuildOptions options;
+  options.add_option("-DVEC_SIZE=" + support::cpp11::to_string(elems_per_vec));
+  options.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+  options.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+  options.add_option_if(vectorized, "-DLAST_ACCESSED_X=" + last_x);
+
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("quantization_symm8", options.options()));
+}
+
+// Checks the arguments, then dry-runs the window configuration on clones of the tensor infos.
+Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
+                                               const ITensorInfo *scale_factor,
+                                               const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+
+  // Configure on clones so that the caller's tensor infos are left untouched
+  return validate_and_configure_window(input->clone().get(), output->clone().get()).first;
+}
+
+// Enqueues the kernel once for every 2D slice of the collapsed execution window.
+void CLQuantizationSymmetricKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Support only 2D
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice_2d = collapsed.first_slice_window_2D();
+  do
+  {
+    // Window for the 1D scale factors: the current slice with its dimensions shifted by one
+    const Window scale_window = slice_2d.shift_dimensions(1);
+    unsigned int arg_idx = 0;
+    add_2D_tensor_argument(arg_idx, _input, slice_2d);
+    add_1D_tensor_argument(arg_idx, _scale_factor, scale_window);
+    add_2D_tensor_argument(arg_idx, _output, slice_2d);
+    enqueue(queue, *this, slice_2d, lws_hint());
+  } while (collapsed.slide_window_slice_2D(slice_2d));
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index 24e89db28..a98318323 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
new file mode 100644
index 000000000..ff1904abd
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <climits>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Checks the input (single-channel F32, at most 2D) and, if already initialized, the output.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+
+  // The output is only checked once it has been initialized (non-zero total size)
+  if (output->tensor_shape().total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    // Expected output: 1D with input->dimension(1) elements
+    const TensorShape expected_shape{input->dimension(1)};
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_shape);
+  }
+  return Status{};
+}
+
+// Auto-initializes the output if needed, builds the execution window over the input and reports
+// "Insufficient Padding!" when update_window_and_padding() signals a changed window.
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // Output auto initialization if not yet initialized: 1D, input->dimension(1) elements
+  const TensorShape expected_shape{input->dimension(1)};
+  auto_init_if_empty(*output, expected_shape, 1, input->data_type());
+
+  // One element is processed per iteration
+  const unsigned int step = 1;
+  Window window = calculate_max_window(*input, Steps(step));
+  AccessWindowHorizontal in_access(input, 0, step);
+  AccessWindowStatic out_access(output, 0, 0, output->dimension(0), 1);
+
+  const bool changed = update_window_and_padding(window, in_access, out_access);
+  out_access.set_valid_region(window, ValidRegion(Coordinates(), output->tensor_shape()));
+  if (changed)
+  {
+    return std::make_tuple(
+        ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!"), window);
+  }
+  return std::make_tuple(Status{}, window);
+}
+} // namespace
+
+CLScaleFactorSymm8Kernel::CLScaleFactorSymm8Kernel() : _input{nullptr}, _output{nullptr} {}
+
+// Validates and stores the tensors, creates the "scale_factor_symm8" kernel for the input
+// width, then configures the execution window.
+void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  _input = input;
+  _output = output;
+
+  // The input's dimension 0 is handed to the kernel build as -DWIDTH
+  const std::set<std::string> options{"-DWIDTH=" +
+                                      support::cpp11::to_string(input->info()->dimension(0))};
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", options));
+
+  // The window configuration also auto-initializes a still-empty output
+  auto window_result = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(window_result));
+  ICLKernel::configure_internal(std::get<1>(window_result));
+}
+
+// Checks the arguments, then dry-runs the window configuration on clones of both tensor infos.
+Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+  // Clones keep the auto-initialization and padding updates away from the caller's infos
+  return std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()));
+}
+
+// Enqueues the kernel once per 2D slice of the collapsed window, with X restricted to one step.
+void CLScaleFactorSymm8Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice_2d = collapsed.first_slice_window_2D();
+  // Restrict X to start 0, end 1, step 1
+  slice_2d.set(Window::DimX, Window::Dimension(0, 1, 1));
+  do
+  {
+    // Window for the 1D output: the current slice with its dimensions shifted by one
+    const Window out_window = slice_2d.shift_dimensions(1);
+    unsigned int arg_idx = 0;
+    add_2D_tensor_argument(arg_idx, _input, slice_2d);
+    add_1D_tensor_argument(arg_idx, _output, out_window);
+    enqueue(queue, *this, slice_2d, lws_hint());
+  } while (collapsed.slide_window_slice_2D(slice_2d));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
index f7836b6cd..8b9b57fd8 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
@@ -147,8 +171,8 @@ void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *
   build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
   if (input->info()->data_type() == DataType::QASYMM8)
   {
-    build_opts.emplace("-DZERO_VALUE=" +
-                       support::cpp11::to_string(input->info()->quantization_info().offset));
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(
+                                             input->info()->quantization_info().uniform().offset));
   }
   else
   {
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
index b085192a2..64fc0384e 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
index 4f2b388c9..151d45e8d 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
index 6cc8d9d13..61999cbd4 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2019 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp
new file mode 100644
index 000000000..d6c49b2b4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+namespace arm_compute
+{
+// No tensors bound, depth 0, on/off values 0 and axis -1 until configure() is called.
+CPPOneHotKernelEx::CPPOneHotKernelEx()
+    : _indices{nullptr}, _output{nullptr}, _depth{0}, _on_value{0}, _off_value{0}, _axis{-1}
+{}
+
+// Validates the parameters, stores them and configures the kernel with a default Window().
+void CPPOneHotKernelEx::configure(const ITensor *indices, ITensor *output, const int depth,
+                                  const float on_value, const float off_value, const int axis)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate(indices, depth, on_value, off_value, axis));
+
+  _indices = indices;
+  _output = output;
+  _depth = depth;
+  _on_value = on_value;
+  _off_value = off_value;
+  _axis = axis;
+  ICPPKernel::configure(Window()); // Default 1 iteration window
+}
+
+Status CPPOneHotKernelEx::validate(const ITensor *indices, const int depth, const float on_value,
+                                   const float off_value, const int axis)
+{
+  ARM_COMPUTE_UNUSED(on_value, off_value); // the on/off values are not checked
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(indices, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->info()->num_dimensions() != 1,
+                                  "Only 1D indices are supported.");
+  ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0); // depth must be positive
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != -1, "Only axis = -1 is supported.");
+  return Status{}; // indices are S32 and 1D, depth > 0, axis == -1
+}
+
+bool CPPOneHotKernelEx::is_parallelisable() const { return false; } // run() needs the full window
+
+// For each entry i of the indices, writes _on_value at output(index, i) and _off_value elsewhere.
+void CPPOneHotKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
+
+  for (size_t row = 0, rows = _indices->info()->dimension(0); row < rows; ++row)
+  {
+    const int32_t hot = *reinterpret_cast<int32_t *>(_indices->ptr_to_element(Coordinates{row}));
+    for (int pos = 0; pos < _depth; ++pos)
+      *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(pos, row))) =
+          (pos == hot) ? _on_value : _off_value;
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
index 8ac667ceb..648afb304 100644
--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
 
 #include "arm_compute/core/Error.h"
@@ -81,7 +97,7 @@ void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info)
   // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset
   const uint8_t fill_value =
       _output->info()->data_type() == DataType::QASYMM8
-          ? utility::clamp<uint8_t>(_output->info()->quantization_info().offset)
+          ? utility::clamp<uint8_t>(_output->info()->quantization_info().uniform().offset)
           : 0;
   // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte
   // values in a buffer of uint8_ts
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
index 4508f5800..254c33ea9 100644
--- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -221,8 +236,9 @@ void elementwise_op_quantized(
   const auto window_end_x = static_cast<int>(window.x().end());
   const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
 
-  const float output_scale = out->info()->quantization_info().scale;
-  const int output_offset = out->info()->quantization_info().offset;
+  UniformQuantizationInfo qinfo = out->info()->quantization_info().uniform();
+  const float output_scale = qinfo.scale;
+  const int output_offset = qinfo.offset;
 
   // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from
   // zero)
@@ -238,8 +254,10 @@ void elementwise_op_quantized(
     const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
     const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
 
-    const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info();
-    const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info();
+    const UniformQuantizationInfo broadcast_qinfo =
+        broadcast_tensor->info()->quantization_info().uniform();
+    const UniformQuantizationInfo non_broadcast_qinfo =
+        non_broadcast_tensor->info()->quantization_info().uniform();
 
     const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
     const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
@@ -269,10 +287,8 @@ void elementwise_op_quantized(
           for (; x < window_end_x; ++x)
           {
             const float afs =
-                scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale,
-                                 non_broadcast_qinfo.offset);
-            const float bfs =
-                scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset);
+                dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+            const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
             *(output_ptr + x) =
                 (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs,
                                out->info()->quantization_info());
@@ -283,12 +299,14 @@ void elementwise_op_quantized(
   else
   {
     // Input1 quantization info
-    const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
-    const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
+    UniformQuantizationInfo qinfo = in1->info()->quantization_info().uniform();
+    const int32x4_t voffset1 = vdupq_n_s32(qinfo.offset);
+    const float32x4_t vscale1 = vdupq_n_f32(qinfo.scale);
 
     // Input2 quantization info
-    const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
-    const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
+    qinfo = in2->info()->quantization_info().uniform();
+    const int32x4_t voffset2 = vdupq_n_s32(qinfo.offset);
+    const float32x4_t vscale2 = vdupq_n_f32(qinfo.scale);
 
     // Clear X Dimension on execution window as we handle manually
     input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -301,26 +319,24 @@ void elementwise_op_quantized(
     Iterator input2(in2, input2_win);
     Iterator output(out, win);
 
-    execute_window_loop(
-        win,
-        [&](const Coordinates &) {
-          const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-          const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-          const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+    execute_window_loop(win,
+                        [&](const Coordinates &) {
+                          const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+                          const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+                          const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
 
-          int x =
-              (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr,
-                           output_ptr, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
-          for (; x < window_end_x; ++x)
-          {
-            const float afs =
-                scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset);
-            const float bfs =
-                scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset);
-            *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info());
-          }
-        },
-        input1, input2, output);
+                          int x = (*neon_func)(window_start_x, window_end_x, window_step_x,
+                                               input1_ptr, input2_ptr, output_ptr, voffset1,
+                                               voffset2, vscale1, vscale2, voffseto, invvscaleo);
+                          for (; x < window_end_x; ++x)
+                          {
+                            const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
+                            const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
+                            *(output_ptr + x) =
+                                (*scalar_func)(afs, bfs, out->info()->quantization_info());
+                          }
+                        },
+                        input1, input2, output);
   }
 }
 
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp
new file mode 100644
index 000000000..648705ba9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/NESymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <array>
+#include <cmath>
+#include <map>
+#include <set>
+
+using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ActivationLayerInfo &activation_info)
+{ // Checks input data type, activation/quantization compatibility and, if given, the output
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
+  // Activation functions accepted for each quantized data type
+  static std::set<ActivationLayerInfo::ActivationFunction> qasymm8_supported_activations = {
+      ActivationLayerInfo::ActivationFunction::RELU,
+      ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+      ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+      ActivationLayerInfo::ActivationFunction::LOGISTIC,
+      ActivationLayerInfo::ActivationFunction::TANH};
+  static std::set<ActivationLayerInfo::ActivationFunction> qsymm16_supported_activations = {
+      ActivationLayerInfo::ActivationFunction::LOGISTIC,
+      ActivationLayerInfo::ActivationFunction::TANH};
+  const DataType data_type = input->data_type();
+  const QuantizationInfo &oq_info = // output's quantization, or the input's when output is null
+      (output != nullptr) ? output->quantization_info() : input->quantization_info();
+  const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
+  // Quantized inputs: only supported functions; tanh/logistic also pin the output quantization
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      is_data_type_quantized_asymmetric(data_type) &&
+          (qasymm8_supported_activations.count(f_act) == 0),
+      "For QASYMM8 only tanh, logistic, relu and lower/upper bounded relu are supported");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) &&
+                                      (qsymm16_supported_activations.count(f_act) == 0),
+                                  "For QSYMM16 only tanh and logistic are supported");
+  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) &&
+                              (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                              (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) &&
+                              (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                              (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+  // Symmetric quantized tanh and logistic both require QuantizationInfo(1/32768, 0)
+  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                              (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                              (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+  ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                              (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                              (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+
+  // Shape and data type are compared only when an output info with non-zero size is given
+  if ((output != nullptr) && (output->total_size() != 0))
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // The execution window is the maximum window of the input with default-constructed Steps
+  const Window max_window = calculate_max_window(*input, Steps());
+  if (output == nullptr)
+  {
+    return std::make_pair(Status{}, max_window);
+  }
+  // Initialise the output info from a clone of the input's info if it is still empty
+  auto_init_if_empty(*output, *input->clone());
+
+  // NEActivationLayerKernelEx needs no padding, so update_window_and_padding() is skipped and
+  // the valid region is set directly to the full output shape
+  Coordinates anchor;
+  anchor.set_num_dimensions(output->num_dimensions());
+  output->set_valid_region(ValidRegion(anchor, output->tensor_shape()));
+  return std::make_pair(Status{}, max_window);
+}
+
+inline uint32x4_t vreinterpret_unsigend_int(const float32x4_t &vec)
+{ // NOTE(review): the name misspells "unsigned"; callers in this file use this spelling
+  return vreinterpretq_u32_f32(vec); // view the four f32 lanes as u32 bit patterns
+}
+
+inline float32x4_t vreinterpret_floating_point(const uint32x4_t &vec)
+{
+  return vreinterpretq_f32_u32(vec); // view the four u32 lanes as f32 bit patterns
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline uint16x8_t vreinterpret_unsigend_int(const float16x8_t &vec)
+{ // FP16 overload, compiled only when FP16 vector arithmetic is available
+  return vreinterpretq_u16_f16(vec); // view the eight f16 lanes as u16 bit patterns
+}
+inline float16x8_t vreinterpret_floating_point(const uint16x8_t &vec)
+{
+  return vreinterpretq_f16_u16(vec); // view the eight u16 lanes as f16 bit patterns
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
+} // namespace
+
+NEActivationLayerKernelEx::NEActivationLayerKernelEx()
+    : _input(nullptr), _output(nullptr), _func(nullptr), _act_info()
+{ // Unconfigured kernel: tensors and executor stay null until configure() assigns them
+}
+
+void NEActivationLayerKernelEx::configure(ITensor *input, ITensor *output,
+                                          ActivationLayerInfo activation_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input); // output may be null: the kernel then runs in-place
+
+  _input = input;
+  _act_info = activation_info;
+  _output = input; // in-place by default
+
+  // Out-of-place calculation: write to the caller-provided output instead of the input
+  if (output != nullptr)
+  {
+    _output = output;
+  }
+
+  // Disabled activation: clear the executor (NOTE(review): the switch below reassigns _func)
+  if (!activation_info.enabled())
+  {
+    _func = nullptr;
+  }
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+      input->info(), (output != nullptr) ? output->info() : nullptr, activation_info));
+
+  // FP32 executors, keyed by activation function
+  static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 = {
+      {ActivationFunction::ABS,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float>},
+      {ActivationFunction::LINEAR,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float>},
+      {ActivationFunction::LOGISTIC,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float>},
+      {ActivationFunction::RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float>},
+      {ActivationFunction::BOUNDED_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float>},
+      {ActivationFunction::LU_BOUNDED_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float>},
+      {ActivationFunction::LEAKY_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float>},
+      {ActivationFunction::SOFT_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float>},
+      {ActivationFunction::ELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float>},
+      {ActivationFunction::SQRT,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float>},
+      {ActivationFunction::SQUARE,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float>},
+      {ActivationFunction::TANH,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float>},
+      {ActivationFunction::IDENTITY,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float>},
+  };
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  // FP16 executors, available only when FP16 vector arithmetic is compiled in
+  static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f16 = {
+      {ActivationFunction::ABS,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float16_t>},
+      {ActivationFunction::LINEAR,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float16_t>},
+      {ActivationFunction::LOGISTIC,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float16_t>},
+      {ActivationFunction::RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float16_t>},
+      {ActivationFunction::BOUNDED_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float16_t>},
+      {ActivationFunction::LU_BOUNDED_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float16_t>},
+      {ActivationFunction::LEAKY_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float16_t>},
+      {ActivationFunction::SOFT_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float16_t>},
+      {ActivationFunction::ELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float16_t>},
+      {ActivationFunction::SQRT,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float16_t>},
+      {ActivationFunction::SQUARE,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float16_t>},
+      {ActivationFunction::TANH,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float16_t>},
+      {ActivationFunction::IDENTITY,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float16_t>},
+  };
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
+
+  // QASYMM8 executors (IDENTITY is listed here although validate_arguments() rejects it)
+  static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 = {
+      {ActivationFunction::LOGISTIC,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qasymm8_t>},
+      {ActivationFunction::BOUNDED_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, qasymm8_t>},
+      {ActivationFunction::LU_BOUNDED_RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t>},
+      {ActivationFunction::RELU,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, qasymm8_t>},
+      {ActivationFunction::TANH,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qasymm8_t>},
+      {ActivationFunction::IDENTITY,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, qasymm8_t>},
+  };
+
+  // QSYMM16 executors: only logistic and tanh
+  static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qsymm16 = {
+      {ActivationFunction::LOGISTIC,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qsymm16_t>},
+      {ActivationFunction::TANH,
+       &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qsymm16_t>},
+  };
+  // Select by input data type (NOTE(review): operator[] default-inserts null for a missing key)
+  switch (input->info()->data_type())
+  {
+    case DataType::QASYMM8:
+      _func = act_map_qasymm8[activation_info.activation()];
+      break;
+    case DataType::QSYMM16:
+      _func = act_map_qsymm16[activation_info.activation()];
+      break;
+    case DataType::F32:
+      _func = act_map_f32[activation_info.activation()];
+      break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    case DataType::F16:
+      _func = act_map_f16[activation_info.activation()];
+      break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    default:
+      ARM_COMPUTE_ERROR("Unsupported data type.");
+  }
+
+  // Compute the execution window (this also auto-initialises the output info) and register it
+  auto win_config =
+      validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICPPKernel::configure(win_config.second);
+}
+
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
+NEActivationLayerKernelEx::activation(const Window &window)
+{ // Floating-point executor: applies activation F to every element covered by the window
+  /** NEON vector tag type. */
+  using ExactTagType =
+      typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+  const int window_step_x = 16 / sizeof(T); // elements per 16-byte vector
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+  const ActivationFunction act = F;
+  // Collapse the window starting at DimZ when possible; X is set to one step and walked manually
+  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+  Iterator input(_input, win_collapsed);
+  Iterator output(_output, win_collapsed);
+  // NOTE(review): 1e-24 is below float16's smallest subnormal; confirm epsilon for float16_t
+  const auto infinity = wrapper::vdup_n(std::numeric_limits<T>::infinity(), ExactTagType{});
+  const auto epsilon = wrapper::vdup_n(static_cast<T>(1e-24), ExactTagType{});
+  const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+  const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+  const auto va = wrapper::vdup_n(static_cast<T>(_act_info.a()), ExactTagType{});
+  const auto vb = wrapper::vdup_n(static_cast<T>(_act_info.b()), ExactTagType{});
+  const auto a = static_cast<T>(_act_info.a());
+  const auto b = static_cast<T>(_act_info.b());
+
+  execute_window_loop(
+      win_collapsed,
+      [&](const Coordinates &) {
+        const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+
+        // Vectorised loop: window_step_x elements (one 128-bit vector) per iteration
+        int x = window_start_x;
+
+        for (; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+          const auto vin = wrapper::vloadq(input_ptr + x);
+          switch (act)
+          {
+            case ActivationFunction::ABS:
+              tmp = wrapper::vabs(vin);
+              break;
+            case ActivationFunction::LINEAR:
+              tmp = wrapper::vmla(vb, va, vin);
+              break;
+            case ActivationFunction::LOGISTIC:
+              // exp(-vin)
+              tmp = wrapper::vexpq(wrapper::vneg(vin));
+
+              // NaN -> INF: lanes where tmp != tmp take the infinity bits, others keep tmp
+              tmp = vreinterpret_floating_point(wrapper::vorr(
+                  wrapper::vand(wrapper::vnot(wrapper::vceq(tmp, tmp)),
+                                vreinterpret_unsigend_int(infinity)),
+                  wrapper::vand(wrapper::vceq(tmp, tmp), vreinterpret_unsigend_int(tmp))));
+
+              // 1 / (1 + tmp)
+              tmp = wrapper::vinv(wrapper::vadd(const_1, tmp));
+              break;
+            case ActivationFunction::RELU:
+              tmp = wrapper::vmax(const_0, vin);
+              break;
+            case ActivationFunction::BOUNDED_RELU:
+              tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+              break;
+            case ActivationFunction::LU_BOUNDED_RELU:
+              tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+              break;
+            case ActivationFunction::LEAKY_RELU:
+              tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+              break;
+            case ActivationFunction::SOFT_RELU:
+              tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)));
+              break;
+            case ActivationFunction::ELU:
+              tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
+                                  wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+              break;
+            case ActivationFunction::SQRT:
+              tmp = wrapper::vinv(wrapper::vinvsqrt(vin + epsilon)); // 1 / (1 / sqrt(vin + eps))
+              break;
+            case ActivationFunction::SQUARE:
+              tmp = wrapper::vmul(vin, vin);
+              break;
+            case ActivationFunction::TANH:
+              tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+              break;
+            case ActivationFunction::IDENTITY:
+              tmp = vin;
+              break;
+            default:
+              ARM_COMPUTE_ERROR("Unsupported activation function");
+          }
+          wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Scalar tail: the remaining elements that do not fill a whole vector
+        for (; x < window_end_x; ++x)
+        {
+          const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+          T tmp;
+          switch (act)
+          {
+            case ActivationFunction::ABS:
+              tmp = std::abs(in);
+              break;
+            case ActivationFunction::LINEAR:
+              tmp = a * in + b;
+              break;
+            case ActivationFunction::LOGISTIC:
+              tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+              break;
+            case ActivationFunction::RELU:
+              tmp = std::max<T>(static_cast<T>(0), in);
+              break;
+            case ActivationFunction::BOUNDED_RELU:
+              tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+              break;
+            case ActivationFunction::LU_BOUNDED_RELU:
+              tmp = std::min<T>(a, std::max<T>(b, in));
+              break;
+            case ActivationFunction::LEAKY_RELU:
+              tmp = (in > 0) ? in : a * in;
+              break;
+            case ActivationFunction::SOFT_RELU:
+              tmp = std::log(static_cast<T>(1) + std::exp(in));
+              break;
+            case ActivationFunction::ELU:
+              tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+              break;
+            case ActivationFunction::SQRT:
+              tmp = std::sqrt(in); // NOTE(review): no epsilon here, unlike the vector path
+              break;
+            case ActivationFunction::SQUARE:
+              tmp = in * in;
+              break;
+            case ActivationFunction::TANH:
+              tmp = a * std::tanh(b * in);
+              break;
+            case ActivationFunction::IDENTITY:
+              tmp = in;
+              break;
+            default:
+              ARM_COMPUTE_ERROR("Unsupported activation function");
+          }
+          *(output_ptr + x) = tmp;
+        }
+      },
+      input, output);
+}
+
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type
+NEActivationLayerKernelEx::activation(const Window &window)
+{
+  const int window_step_x = 16 / sizeof(T);
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+  const ActivationFunction act = F;
+
+  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+  Iterator input(_input, win_collapsed);
+  Iterator output(_output, win_collapsed);
+
+  const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform();
+  const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform();
+  const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(_act_info.a(), qi_in));
+  const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(_act_info.b(), qi_in));
+  const qasymm8_t a = quantize_qasymm8(_act_info.a(), qi_in);
+  const qasymm8_t b = quantize_qasymm8(_act_info.b(), qi_in);
+  const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
+  const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
+  const auto vconst_1 = vdupq_n_f32(1.f);
+  const float32x4_t va_f32 = vdupq_n_f32(_act_info.a());
+  const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b());
+  const float a_f32 = _act_info.a();
+  const float b_f32 = _act_info.b();
+
+  // Initialise scale/offset for re-quantization
+  float s = qi_in.scale / qi_out.scale;
+  float o = -qi_in.offset * s + qi_out.offset;
+  float32x4_t vs = vdupq_n_f32(s);
+  float32x4_t vo = vdupq_n_f32(o);
+
+  execute_window_loop(
+      win_collapsed,
+      [&](const Coordinates &) {
+        const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for (; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+          const auto vin = wrapper::vloadq(input_ptr + x);
+          if (act == ActivationFunction::RELU)
+          {
+            // Perform activation
+            tmp = vmaxq_u8(vconst_0, vin);
+            // Re-quantize to new output space
+            tmp = vmlaq_qasymm8(tmp, vs, vo);
+          }
+          else if (act == ActivationFunction::BOUNDED_RELU)
+          {
+            // Perform activation
+            tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+            // Re-quantize to new output space
+            tmp = vmlaq_qasymm8(tmp, vs, vo);
+          }
+          else if (act == ActivationFunction::LU_BOUNDED_RELU)
+          {
+            // Perform activation
+            tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+            // Re-quantize to new output space
+            tmp = vmlaq_qasymm8(tmp, vs, vo);
+          }
+          else if (act == ActivationFunction::LOGISTIC)
+          {
+            // De-quantize
+            const auto vin_deq = vdequantize(vin, qi_in);
+            // Perform activation
+            const float32x4x4_t tmp_dep = {{
+                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
+                                                                    vin_deq.val[0])))),
+                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
+                                                                    vin_deq.val[1])))),
+                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
+                                                                    vin_deq.val[2])))),
+                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
+                                                                    vin_deq.val[3])))),
+            }};
+            // Re-quantize to new output space
+            tmp = vquantize(tmp_dep, qi_out);
+          }
+          else if (act == ActivationFunction::TANH)
+          {
+            // De-quantize
+            const auto vin_deq = vdequantize(vin, qi_in);
+            // Perform activation
+            const float32x4x4_t tmp_dep = {{
+                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
+                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
+            }};
+            // Re-quantize to new output space
+            tmp = vquantize(tmp_dep, qi_out);
+          }
+          else
+          {
+            ARM_COMPUTE_ERROR("Unsupported activation function");
+          }
+          wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Compute left-over elements
+        for (; x < window_end_x; ++x)
+        {
+          T in = *(reinterpret_cast<const T *>(input_ptr + x));
+          T tmp;
+          if (act == ActivationFunction::RELU)
+          {
+            tmp = std::max(const_0, in);
+            tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+          }
+          else if (act == ActivationFunction::BOUNDED_RELU)
+          {
+            tmp = std::min(a, std::max(const_0, in));
+            tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+          }
+          else if (act == ActivationFunction::LU_BOUNDED_RELU)
+          {
+            tmp = std::min(a, std::max(b, in));
+            tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255));
+          }
+          else if (act == ActivationFunction::LOGISTIC)
+          {
+            float tmp_f = dequantize_qasymm8(in, qi_in);
+            tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+            tmp = quantize_qasymm8(tmp_f, qi_out);
+          }
+          else if (act == ActivationFunction::TANH)
+          {
+            float tmp_f = dequantize_qasymm8(in, qi_in);
+            tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+            tmp = quantize_qasymm8(tmp_f, qi_out);
+          }
+          else
+          {
+            ARM_COMPUTE_ERROR("Unsupported activation function");
+          }
+          *(output_ptr + x) = tmp;
+        }
+      },
+      input, output);
+}
+
+template <ActivationLayerInfo::ActivationFunction F, typename T> // F: activation, T: element type
+typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type // only for T == qsymm16_t
+NEActivationLayerKernelEx::activation(const Window &window) // handles LOGISTIC and TANH only
+{
+  const int window_step_x = 16 / sizeof(T); // elements per 16-byte (128-bit) vector
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+  const ActivationFunction act = F; // F is a template argument, so act is fixed per instantiation
+
+  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); // X is walked by hand in the lambda
+
+  Iterator input(_input, win_collapsed);
+  Iterator output(_output, win_collapsed);
+
+  const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform();
+  const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform();
+  const auto vconst_1 = vdupq_n_f32(1.f); // 1.0f in every lane, used by the logistic formula
+  const float32x4_t va_f32 = vdupq_n_f32(_act_info.a()); // vector copies of a/b for the TANH path
+  const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b());
+  const float a_f32 = _act_info.a(); // scalar copies of a/b for the left-over loop
+  const float b_f32 = _act_info.b();
+
+  execute_window_loop(
+      win_collapsed,
+      [&](const Coordinates &) { // coordinates are unused; pointers come from the iterators
+        const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+        ARM_COMPUTE_UNUSED(tmp); // NOTE(review): tmp is assigned and stored below; looks redundant
+
+        // Vector loop: window_step_x elements (one 128-bit register) per iteration
+        int x = window_start_x;
+        for (; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+          const auto vin = wrapper::vloadq(input_ptr + x);
+          if (act == ActivationFunction::LOGISTIC)
+          {
+            // De-quantize to float using only the input scale
+            const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+            // Logistic on each float vector: 1 / (1 + exp(-x))
+            const float32x4x2_t tmp_dep = {{
+                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
+                                                                    vin_deq.val[0])))),
+                wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(
+                                                                    vin_deq.val[1])))),
+            }};
+            // Re-quantize using only the output scale
+            tmp = vquantize_int16(tmp_dep, qi_out.scale);
+          }
+          else if (act == ActivationFunction::TANH)
+          {
+            // De-quantize to float using only the input scale
+            const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+            // Scaled tanh on each float vector: a * tanh(b * x), a/b from _act_info
+            const float32x4x2_t tmp_dep = {{
+                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
+                wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
+            }};
+            // Re-quantize using only the output scale
+            tmp = vquantize_int16(tmp_dep, qi_out.scale);
+          }
+          else
+          {
+            ARM_COMPUTE_ERROR("Unsupported activation function");
+          }
+          wrapper::vstore(output_ptr + x, tmp);
+        }
+
+        // Scalar tail: elements left over after the last full vector, same formulas as above
+        for (; x < window_end_x; ++x)
+        {
+          T in = *(reinterpret_cast<const T *>(input_ptr + x));
+          T tmp;
+          if (act == ActivationFunction::LOGISTIC)
+          {
+            float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+            tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+            tmp = quantize_qsymm16(tmp_f, qi_out); // full qi_out here; vector path passes .scale
+          }
+          else if (act == ActivationFunction::TANH)
+          {
+            float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+            tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+            tmp = quantize_qsymm16(tmp_f, qi_out); // full qi_out here; vector path passes .scale
+          }
+          else
+          {
+            ARM_COMPUTE_ERROR("Unsupported activation function");
+          }
+          *(output_ptr + x) = tmp;
+        }
+      },
+      input, output);
+}
+
+Status NEActivationLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ActivationLayerInfo &act_info)
+{
+  ARM_COMPUTE_UNUSED(act_info); // NOTE(review): redundant, act_info is used on the next line
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info)); // argument checks
+  ARM_COMPUTE_RETURN_ON_ERROR( // window check runs on clones; a null output is forwarded as nullptr
+      validate_and_configure_window(input->clone().get(),
+                                    (output != nullptr) ? output->clone().get() : nullptr)
+          .first);
+
+  return Status{}; // reached only when neither check above returned early
+}
+
+void NEActivationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+  // A disabled activation does nothing; this return also precedes the configuration checks below
+  if (!_act_info.enabled())
+  {
+    return;
+  }
+
+  ARM_COMPUTE_UNUSED(info); // thread info is not consulted by this kernel
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+  ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+  (this->*_func)(window); // dispatch through the member-function pointer held in _func
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
index d2f42de53..32d7d6237 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h"
 
 #include "arm_compute/core/Error.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
index 7e4fc129b..fbb9dbca9 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NECastKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
@@ -394,7 +410,8 @@ template <typename FromT> void run_cast(const ITensor *input, ITensor *output, c
             case DataType::QASYMM8:
             {
               using to_vector = typename cast_vector<float>::type;
-              const QuantizationInfo &qinfo_out = output->info()->quantization_info();
+              const UniformQuantizationInfo &qinfo_out =
+                  output->info()->quantization_info().uniform();
               const auto vf = vcast<to_vector, from_vector>(vin);
               const auto vout = vquantize(vf, qinfo_out);
               store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
@@ -440,7 +457,8 @@ template <typename FromT> void run_cast(const ITensor *input, ITensor *output, c
             case DataType::QASYMM8:
             {
               const QuantizationInfo &qinfo_out = output->info()->quantization_info();
-              const auto qval = qinfo_out.quantize(static_cast<float>(val), rounding_policy);
+              const auto qval =
+                  quantize_qasymm8(static_cast<float>(val), qinfo_out, rounding_policy);
               *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval;
               break;
             }
@@ -486,8 +504,8 @@ void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &windo
 #else  //__aarch64__
   constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
 #endif //__aarch64__
-  const auto &qinfo_in = input->info()->quantization_info();
-  const auto &qinfo_out = output->info()->quantization_info();
+  const auto &qinfo_in = input->info()->quantization_info().uniform();
+  const auto &qinfo_out = output->info()->quantization_info().uniform();
 
   execute_window_loop(
       win_collapsed,
@@ -547,7 +565,7 @@ void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &windo
         for (; x < window_end_x; ++x)
         {
           qasymm8_t qval_in = *(in_ptr + x);
-          const auto val = qinfo_in.dequantize(qval_in);
+          const auto val = dequantize_qasymm8(qval_in, qinfo_in);
 
           switch (output->info()->data_type())
           {
@@ -558,7 +576,7 @@ void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &windo
             }
             case DataType::QASYMM8:
             {
-              const auto qval_out = qinfo_out.quantize(val, rounding_policy);
+              const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy);
               *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out;
               break;
             }
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
index 8a2223c26..95e269dee 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
 
 #include "arm_compute/core/Helpers.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
index cebd614df..200fc4f87 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
 
 #include "arm_compute/core/CPP/Validate.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
index 5401afea0..091d38c56 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
 
 #include "arm_compute/core/Error.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
index ce2413dc1..4c0a5e799 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
 
 #include "arm_compute/core/CPP/Validate.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
index 391337bfb..30787c0a4 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
 
 #include "arm_compute/core/Error.h"
@@ -118,7 +134,7 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info)
 
   const size_t lookup_dim = _output->info()->num_dimensions() - 1;
   const int const_0 = _output->info()->data_type() == DataType::QASYMM8
-                          ? _output->info()->quantization_info().offset
+                          ? _output->info()->quantization_info().uniform().offset
                           : 0;
 
   std::unordered_map<int32_t, size_t> key_index_map;
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
index 1ea77fb5c..49adf1462 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
 
 #include "arm_compute/core/CPP/Validate.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
index de218d489..b92130cec 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
 
 #include "arm_compute/core/Error.h"
@@ -71,12 +87,6 @@ inline int32x4x4_t load_value(const int32_t *input_ptr)
           wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
 }
 
-inline float32x4x4_t load_value(const float *input_ptr)
-{
-  return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4),
-          wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)};
-}
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 inline const float32x4x4_t load_value(const float16_t *input_ptr)
 {
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
index ad1bb9051..641641b5a 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
 
 #include "arm_compute/core/ITensor.h"
@@ -63,7 +79,8 @@ template <ConditionalOperation op>
 inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
                                                            QuantizationInfo qinfo)
 {
-  return qinfo.quantize(elementwise_conditional_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP);
+  return quantize_qasymm8(elementwise_conditional_op_scalar<op>(a, b), qinfo,
+                          RoundingPolicy::TO_NEAREST_UP);
 }
 
 template <ConditionalOperation op, typename VectorType>
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
index acf0092eb..6ba0f1fd4 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
 
 #include "arm_compute/core/Error.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
index 59e7d9beb..3b65eac10 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
 
 #include "arm_compute/core/CPP/Validate.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
index 36a2f55a9..44feb200f 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
 
 #include "arm_compute/core/Helpers.h"
diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp
index 94242b56b..863316909 100644
--- a/compute/ARMComputeEx/src/core/UtilsEx.cpp
+++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/core/UtilsEx.h"
 #include "arm_compute/core/Error.h"
 
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
index ae64a6edd..2d379cf36 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLArgOperation.h"
 
 #include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
index 7c5fe5eda..92ee69a36 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
 
 #include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
index 742fc6f59..b3118f39e 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLCast.h"
 
 #include "arm_compute/core/CL/kernels/CLCastKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
index c2e4ca9ff..db662505a 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
 
 #include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
index 2781784ca..3d9a28a48 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
 
 #include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
new file mode 100644
index 000000000..f098832b0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{ // Validates the GEMMLowp multiply of `input` by `weights` into `output`.
+  ARM_COMPUTE_UNUSED(input); // NOTE(review): these three markers look redundant (all used below)
+  ARM_COMPUTE_UNUSED(weights);
+  ARM_COMPUTE_UNUSED(output);
+  ARM_COMPUTE_RETURN_ON_ERROR( // third argument is left null, matching configure_mm()
+      CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+
+  return Status{}; // reached only when the check above reported no error
+}
+} // namespace
+
+void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+{ // Sets up the weights reshape as a CLTransposeKernel from `input` into `output`.
+  auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+  k->configure(input, output);
+  _kernel = std::move(k); // the configured kernel is stored in this object's _kernel member
+}
+
+Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
+                                                           const ITensorInfo *output)
+{ // Returns whatever CLTransposeKernel::validate reports for this input/output pair.
+  return CLTransposeKernel::validate(input, output);
+}
+
+CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
+      _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(),
+      _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(),
+      _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), // reset in configure()
+      _original_weights(nullptr)
+{ // NOTE(review): _scale_factor_kernel (used in configure() and run()) is not listed above.
+}
+void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights,
+                                               ICLTensor *output, bool retain_internal_weights)
+{ // Configures _mm_gemmlowp to multiply `input` by `weights` into `output`.
+  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+  ARM_COMPUTE_UNUSED(output); // NOTE(review): output is used below; this marker looks redundant
+  ARM_COMPUTE_UNUSED(retain_internal_weights); // not read anywhere else in this function
+  // Configure gemmlowp function (the third argument is left null)
+  _mm_gemmlowp.configure(input, weights, nullptr, output);
+}
+
+void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTensor *weights,
+                                            const ICLTensor *biases, ICLTensor *output,
+                                            FullyConnectedLayerInfo fc_info)
+{ // Configures: scale factor -> S8 quantize -> GEMMLowp -> multiply scale, plus optional biases.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  // Perform validate step (biases may be null and is forwarded as a null info)
+  ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate(
+      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+      fc_info));
+
+  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  _accumulate_biases = false; // set to true below only when biases are supplied
+  _is_prepared = fc_info.retain_internal_weights; // true makes prepare() skip its one-time work
+  _original_weights = weights; // prepare() asserts this tensor is still marked as used
+
+  // Optional biases: a separate kernel on (output, biases) that run() enqueues last
+  if (biases != nullptr)
+  {
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+    _accumulate_biases = true;
+
+    // Configure accumulate biases kernel
+    _accumulate_biases_kernel.set_target(CLScheduler::get().target());
+    _accumulate_biases_kernel.configure(output, biases);
+  }
+
+  const ICLTensor *weights_to_use = weights; // redirected to the reshaped tensor further down
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  // Check if we have a fully connected layer with batches (output dimension 1 > 1)
+  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+  bool is_fc_after_conv = false;
+  if (is_batched_fc_layer)
+  {
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && // input dims [3..) vs output [1..)
+                       (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                   input->info()->tensor_shape().cend(),
+                                   output->info()->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
+  }
+  ARM_COMPUTE_ERROR_ON_MSG(is_fc_after_conv,
+                           "CLFullyConnectedHybridLayer does not support after conv");
+  ARM_COMPUTE_UNUSED(is_fc_after_conv); // presumably for builds where the assert compiles out
+
+  // Reshape weights if needed (only configured here; prepare() runs the kernel once)
+  if (!_are_weights_reshaped)
+  {
+    // The reshape target gets the transposed shape of the weights
+    _reshape_weights_output.allocator()->init(
+        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+            compute_transposed_shape(*weights->info())));
+    _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output);
+    weights_to_use = &_reshape_weights_output;
+  }
+
+  // Extract scale factor: 1-D tensor of length output dimension 1, in the input's data type
+  _scale_factor.allocator()->init(
+      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type()));
+  _memory_group.manage(&_scale_factor);
+  _scale_factor_kernel.configure(input, &_scale_factor);
+
+  // Quantize input into an S8 clone of the input's tensor info, using _scale_factor
+  _quantized_input.allocator()->init(
+      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+  _memory_group.manage(&_quantized_input);
+  _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
+
+  // GEMMLowp writes into an S32 clone of the output's tensor info
+  _gemmlowp_output.allocator()->init(
+      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+  _memory_group.manage(&_gemmlowp_output);
+  configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output,
+               fc_info.retain_internal_weights);
+  _quantized_input.allocator()->allocate(); // both functions using it are configured by now
+
+  // Multiply scale: takes the S32 result, _scale_factor and the weights' uniform scale
+  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
+                                   weights->info()->quantization_info().uniform().scale);
+  _gemmlowp_output.allocator()->allocate();
+  _scale_factor.allocator()->allocate();
+
+  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                             const ITensorInfo *biases, const ITensorInfo *output,
+                                             FullyConnectedLayerInfo fc_info)
+{ // Runs the checks for each stage of configure() on tensor infos; empty Status on success.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); // weights may have at most 2 dims
+
+  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  bool is_fc_after_conv = true; // reassigned in both branches of the batch check below
+  const GPUTarget gpu_target = CLScheduler::get().target(); // only used by the biases check
+
+  const ITensorInfo &reshaped_weights = // only consulted when weights still need reshaping
+      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+
+  // Validate the accumulate biases kernel when biases are given
+  if (biases != nullptr)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
+  }
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensorInfo *weights_to_use = weights;
+
+  // Check if we have a fully connected layer with batches (output dimension 1 > 1)
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+  if (is_batched_fc_layer)
+  {
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && // input dims [3..) vs output [1..)
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->num_dimensions() > 1 && input->dimension(1) > 1;
+  }
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_fc_after_conv,
+                                  "CLFullyConnectedHybridLayer does not support after conv");
+
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel, then continue with the transposed weights info
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  // Validate Scale factor kernel (same tensor info as configure() builds for _scale_factor)
+  const ITensorInfo &scale_factor =
+      TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type());
+  ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
+
+  // Validate quantization symm8 kernel against an S8 clone of the input info
+  const ITensorInfo &quantized_input = TensorInfo(
+      input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
+
+  // Fully Connected layer after a Fully Connected Layer: input dim 0 must equal weights dim 1
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+
+  // Validate matrix multiply kernel against an S32 clone of the output info
+  const ITensorInfo &gemmlowp_output = TensorInfo(
+      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
+
+  // Multiply scale
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output));
+
+  return Status{};
+}
+
+void CLFullyConnectedHybridLayer::run()
+{ // Executes the configured stages in order; the one-time preparation happens first.
+  prepare(); // does nothing once _is_prepared is set
+
+  MemoryGroupResourceScope scope_mg(_memory_group); // scoped to this call; presumably acquires the managed tensors - confirm
+
+  // Extract scale_factor
+  CLScheduler::get().enqueue(_scale_factor_kernel);
+
+  // Quantize input
+  CLScheduler::get().enqueue(_quant_input_kernel);
+
+  // Run matrix multiply
+  _mm_gemmlowp.run();
+
+  // Multiply scale factor
+  CLScheduler::get().enqueue(_multiply_scale_kernel);
+
+  // Accumulate biases if provided (flag is set in configure() when biases is non-null)
+  if (_accumulate_biases)
+  {
+    CLScheduler::get().enqueue(_accumulate_biases_kernel);
+  }
+}
+
+void CLFullyConnectedHybridLayer::prepare()
+{ // One-time setup: reshapes the weights if needed and prepares the GEMMLowp function.
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    auto release_unused = [](CLTensor *w) { // frees w's buffer only if w is no longer marked used
+      if (!w->is_used())
+      {
+        CLScheduler::get().queue().finish(); // wait for queued work before freeing the buffer
+        w->allocator()->free();
+      }
+    };
+
+    // Reshape of the weights if needed (happens only once)
+    if (!_are_weights_reshaped)
+    {
+      // Allocate the reshape output and run the reshape kernel (weights are not marked unused)
+      _reshape_weights_output.allocator()->allocate();
+      _reshape_weights_kernel.run();
+
+      _are_weights_reshaped = true;
+      // We can not release _original_weights because it can be used in other nodes
+    }
+
+    // Prepare the GEMMLowp function
+    _mm_gemmlowp.prepare();
+
+    // Release reshaped weights if unused (nothing in this function marks them unused itself)
+    release_unused(&_reshape_weights_output);
+
+    _is_prepared = true;
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..63e291b36
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::utils::cast;
+
+namespace
+{
+Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights,
+                                       const ITensorInfo &output,
+                                       GEMMLowpOutputStageInfo &gemmlowp_output_stage)
+{ // Fills gemmlowp_output_stage; offset/multiplier/shift stay 0 unless input is quantized asymmetric.
+  gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+  gemmlowp_output_stage.gemmlowp_offset = 0;
+  gemmlowp_output_stage.gemmlowp_multiplier = 0;
+  gemmlowp_output_stage.gemmlowp_shift = 0;
+
+  // Configure output stage for quantized case
+  if (is_data_type_quantized_asymmetric(input.data_type()))
+  {
+    const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+    const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = output.quantization_info().uniform();
+
+    const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info; // empty output: use input's
+
+    const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
+    int output_multiplier = 0;
+    int output_shift = 0;
+    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(
+        multiplier, &output_multiplier, &output_shift)); // fills output_multiplier and output_shift
+
+    // Set the GEMMLowp output stage info
+    gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+    gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+    gemmlowp_output_stage.gemmlowp_shift = output_shift;
+    gemmlowp_output_stage.gemmlowp_min_bound = 0;
+    gemmlowp_output_stage.gemmlowp_max_bound = 255; // presumably the unsigned 8-bit range - confirm
+    gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); // same values, appended to
+    gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);           // the vector members
+  }
+
+  return Status{};
+}
+
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias,
+                   const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info)
+{ // Validates the multiply: GEMMLowp for quantized asymmetric input, CLGEMM otherwise.
+  GEMMLowpOutputStageInfo gemmlowp_output_stage;
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage));
+
+  const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+                                       false, // is_b_reshaped
+                                       true,  // reshape_b_only_on_first_run
+                                       0,     // depth_output_gemm3d
+                                       false, // reinterpret_input_as_3d
+                                       fc_info.retain_internal_weights, // retain_internal_weights
+                                       gemmlowp_output_stage,           // gemmlowp_output_stage
+                                       fc_info.fp_mixed_precision,      // fp_mixed_precision
+                                       true,                            // broadcast_bias
+                                       ActivationLayerInfo());          // activation_info
+
+  if (is_data_type_quantized_asymmetric(input.data_type()))
+  {
+    const UniformQuantizationInfo iq_info = input.quantization_info().uniform();
+    const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+    // Since we need negative offsets for computing convolution, we need to change
+    // QuantizationInfo()
+    // Extract and negate input and weights offset
+    const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset);
+    const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
+
+    // Validate gemmlowp function on clones, so the caller's tensor infos are left untouched
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
+        &input.clone()->set_quantization_info(input_quantization_info),
+        &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output,
+        gemm_info));
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR( // 1.f, 1.f: presumably alpha and beta - confirm
+        CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info));
+  }
+
+  return Status{};
+}
+} // namespace
+
+void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
+{ // Sets up the weights reshape as a CLTransposeKernel from `input` into `output`.
+  auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+  k->configure(input, output);
+  _kernel = std::move(k); // the configured kernel is stored in this object's _kernel member
+}
+
+Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input,
+                                                       const ITensorInfo *output)
+{ // Returns whatever CLTransposeKernel::validate reports for this input/output pair.
+  return CLTransposeKernel::validate(input, output);
+}
+
+CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager,
+                                                 IWeightsManager *weights_manager)
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(),
+      _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(),
+      _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), // gets both managers
+      _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), // memory manager only
+      _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
+      _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
+{ // Members are only initialized here; nothing is configured in the constructor body.
+}
+void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights,
+                                           const ICLTensor *bias, ICLTensor *output,
+                                           const FullyConnectedLayerInfo &fc_info)
+{ // Configures _mm_gemmlowp when _is_quantized is set, otherwise _mm_gemm.
+  GEMMLowpOutputStageInfo gemmlowp_output_stage;
+  construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(),
+                                  gemmlowp_output_stage); // NOTE(review): returned Status is discarded
+
+  const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+                                       false, // is_b_reshaped
+                                       true,  // reshape_b_only_on_first_run
+                                       0,     // depth_output_gemm3d
+                                       false, // reinterpret_input_as_3d
+                                       fc_info.retain_internal_weights, // retain_internal_weights
+                                       gemmlowp_output_stage,           // gemmlowp_output_stage
+                                       fc_info.fp_mixed_precision,      // fp_mixed_precision
+                                       true,                            // broadcast_bias
+                                       ActivationLayerInfo());          // activation_info
+
+  if (_is_quantized)
+  {
+    // Since we need negative offsets for computing convolution, we need to change
+    // QuantizationInfo()
+    // Save the current infos, then set negated input and weights offsets on the tensors
+    const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+    const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+    input->info()->set_quantization_info(QuantizationInfo(
+        input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+    weights->info()->set_quantization_info(QuantizationInfo(
+        weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+    // Configure gemmlowp function while the negated offsets are in place
+    _mm_gemmlowp.configure(input, weights, bias, output, gemm_info);
+
+    // Revert back QuantizationInfo as input and weights could be used in other fully connected
+    // layers
+    input->info()->set_quantization_info(input_quantization_info);
+    weights->info()->set_quantization_info(weights_quantization_info);
+  }
+  else
+  {
+    // Configure matrix multiply kernel (1.f, 1.f: presumably alpha and beta - confirm)
+    _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info);
+  }
+}
+
+// Configures the layer for an input that has to be flattened first: the input is linearized
+// into the internal _flatten_output tensor, which is then used as the matrix multiply input.
+void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights,
+                                                const ICLTensor *bias, ICLTensor *output,
+                                                const FullyConnectedLayerInfo &fc_info)
+{
+  // Dimension 1 of the weights must equal the product of the first three input dimensions
+  ARM_COMPUTE_ERROR_ON(
+      (weights->info()->dimension(1) !=
+       (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+  // If the fully connected layer is called after a convolution layer, the input tensor must be
+  // linearized
+
+  // Initialize output tensor for flatten
+  // (clone of the input info made resizable, with padding reset, the flattened shape and NCHW
+  // data layout)
+  TensorShape shape_flatten = compute_flatten_shape(input->info());
+  _flatten_output.allocator()->init(input->info()
+                                        ->clone()
+                                        ->set_is_resizable(true)
+                                        .reset_padding()
+                                        .set_tensor_shape(shape_flatten)
+                                        .set_data_layout(DataLayout::NCHW));
+
+  // Configure flatten kernel
+  // The intermediate tensor is registered with the memory group before it is used
+  _memory_group.manage(&_flatten_output);
+  _flatten_layer.configure(input, &_flatten_output);
+
+  // Configure matrix multiply kernel
+  configure_mm(&_flatten_output, weights, bias, output, fc_info);
+
+  // Allocate the output tensor for flatten once all the configure methods have been called
+  _flatten_output.allocator()->allocate();
+}
+
+// Configures the layer for an input that needs no flattening: the input is passed straight to
+// configure_mm().
+void CLFullyConnectedLayerEx::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights,
+                                              const ICLTensor *bias, ICLTensor *output,
+                                              const FullyConnectedLayerInfo &fc_info)
+{
+  // Dimension 0 of the input must match dimension 1 of the weights
+  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+  // Configure matrix multiply kernel
+  configure_mm(input, weights, bias, output, fc_info);
+}
+
+// Configures the whole fully connected layer.
+//  input   - source tensor (must not be null)
+//  weights - weights tensor (must not be null); remembered in _original_weights
+//  biases  - optional bias tensor, may be nullptr
+//  output  - destination tensor (must not be null)
+//  fc_info - layer options read here: transpose_weights, are_weights_reshaped,
+//            retain_internal_weights, weights_trained_layout
+// The arguments are first checked with validate(); then the optional weights reshape and
+// weights conversion steps are configured, followed by either the flatten + matrix multiply
+// path or the plain matrix multiply path.
+void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+                                        const ICLTensor *biases, ICLTensor *output,
+                                        FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  // Perform validate step
+  ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate(
+      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+      fc_info));
+
+  _are_weights_converted = true;
+  // When transpose_weights is false the weights are treated as already reshaped
+  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  _is_fc_after_conv = true;
+  _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+  // With retain_internal_weights set, run() skips its one-time allocation of the internal
+  // weights tensors
+  _is_prepared = fc_info.retain_internal_weights;
+  _original_weights = weights;
+
+  if (_weights_manager)
+  {
+    _weights_manager->manage(weights);
+  }
+
+  const ICLTensor *weights_to_use = weights;
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+  if (is_batched_fc_layer)
+  {
+    // Batched: the input dimensions from index 3 onwards are compared with the output
+    // dimensions from index 1 onwards
+    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                    input->info()->tensor_shape().cend(),
+                                    output->info()->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    // Not batched: any input with more than one dimension takes the flatten path
+    _is_fc_after_conv = input->info()->num_dimensions() > 1;
+  }
+
+  // Reshape weights if needed
+  if (!_are_weights_reshaped)
+  {
+    if (_weights_manager && _weights_manager->are_weights_managed(weights))
+    {
+      _reshape_weights_managed_function.configure(weights);
+      weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
+          _weights_manager->acquire(weights, &_reshape_weights_managed_function));
+    }
+    else
+    {
+      // Reshape the weights
+      _reshape_weights_function.configure(weights, &_reshape_weights_output);
+      weights_to_use = &_reshape_weights_output;
+    }
+  }
+
+  // Convert weights if needed
+  if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+  {
+    if (_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
+    {
+      _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(),
+                                         fc_info.weights_trained_layout);
+      // NOTE(review): acquire() is keyed on the original 'weights' although the function was
+      // configured on 'weights_to_use' - confirm this is the intended key
+      weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>(
+          _weights_manager->acquire(weights, &_convert_weights_managed));
+    }
+    else
+    {
+      // Convert weights
+      _convert_weights.configure(weights_to_use, &_converted_weights_output,
+                                 input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+      weights_to_use = &_converted_weights_output;
+    }
+    _are_weights_converted = false;
+  }
+
+  if (_is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    configure_conv_fc(input, weights_to_use, biases, output, fc_info);
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    configure_fc_fc(input, weights_to_use, biases, output, fc_info);
+  }
+}
+
+// Checks whether the layer can be configured with the given tensor infos and options.
+// Mirrors the decisions taken in configure(): each optional stage (weights reshape, weights
+// conversion, flatten) is validated only when it would be used, and the matrix multiply is
+// validated last. Returns an empty Status when every check passes, otherwise the first error.
+// biases is not null-checked here; it is forwarded to validate_mm() as given.
+Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                         const ITensorInfo *biases, const ITensorInfo *output,
+                                         FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+  // Weights with more than two dimensions are rejected
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+  // When transpose_weights is false the weights are treated as already reshaped
+  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  bool is_fc_after_conv = true;
+
+  // Tensor infos describing the intermediate tensors of the optional stages
+  // Flattened input: clone of the input info with the flattened shape and NCHW data layout
+  const ITensorInfo &flatten_input = TensorInfo(input->clone()
+                                                    ->set_is_resizable(true)
+                                                    .reset_padding()
+                                                    .set_tensor_shape(compute_flatten_shape(input))
+                                                    .set_data_layout(DataLayout::NCHW));
+  // Reshaped weights: clone of the weights info with the transposed shape
+  const ITensorInfo &reshaped_weights =
+      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+  // Converted weights: based on the original weights info if they count as reshaped, otherwise
+  // on the reshaped weights info
+  const ITensorInfo &converted_weights =
+      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                       : TensorInfo(*reshaped_weights.clone());
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensorInfo *input_to_use = input;
+  const ITensorInfo *weights_to_use = weights;
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+  if (is_batched_fc_layer)
+  {
+    // Batched: the input dimensions from index 3 onwards are compared with the output
+    // dimensions from index 1 onwards
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    // Not batched: any input with more than one dimension takes the flatten path
+    is_fc_after_conv = input->num_dimensions() > 1;
+  }
+
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Validate convert weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(
+        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+    weights_to_use = &converted_weights;
+  }
+
+  if (is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    // Dimension 1 of the weights must equal the product of the first three input dimensions
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        (weights_to_use->dimension(1) !=
+         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+    // Validate flatten kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
+    input_to_use = &flatten_input;
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+  }
+
+  // Validate matrix multiply kernel
+  // (validate_mm is defined outside this block)
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
+
+  return Status{};
+}
+
+// Executes the layer.
+// Only the allocation of the internal reshaped/converted weights tensors is guarded by
+// _is_prepared. The weights reshape, the weights conversion and _mm_gemm.prepare() sit outside
+// that guard, so they are invoked on every call. Afterwards the optional flatten step and the
+// matrix multiply are run inside a memory group scope.
+void CLFullyConnectedLayerEx::run()
+{
+  if (!_is_prepared)
+  {
+    // One-time allocation of the intermediate weights tensors that are actually needed
+    if (!_are_weights_reshaped)
+      _reshape_weights_output.allocator()->allocate();
+    if (!_are_weights_converted)
+      _converted_weights_output.allocator()->allocate();
+    _is_prepared = true;
+  }
+
+  {
+    if (!_weights_manager)
+    {
+      // Without a weights manager the original weights must still be marked as used
+      ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+    }
+
+    // Pointer to current weights
+    const ICLTensor *cur_weights = _original_weights;
+    // Reshape of the weights
+    if (!_are_weights_reshaped)
+    {
+      if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+      {
+        // NOTE(review): this overwrites the _original_weights member and leaves cur_weights
+        // unchanged, whereas the disabled prepare() body assigns the result to cur_weights -
+        // confirm which is intended
+        _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>(
+            _weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+      }
+      else
+      {
+        _reshape_weights_function.run();
+        cur_weights = &_reshape_weights_output;
+      }
+    }
+
+    // Convert weights if needed
+    if (!_are_weights_converted)
+    {
+      if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+      {
+        _weights_manager->run(cur_weights, &_convert_weights_managed);
+      }
+      else
+      {
+        _convert_weights.run();
+      }
+    }
+
+    // Prepare the GEMM function (non-quantized path only)
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+  }
+
+  // The memory group scope covers the flatten and matrix multiply steps below
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Linearize input if it comes from a convolutional layer
+  if (_is_fc_after_conv)
+  {
+    _flatten_layer.run();
+  }
+
+  // Run matrix multiply
+  if (_is_quantized)
+  {
+    _mm_gemmlowp.run();
+  }
+  else
+  {
+    _mm_gemm.run();
+  }
+}
+
+// Currently does nothing: the entire body is compiled out with '#if 0'.
+// The weights reshape/conversion and the GEMM prepare call are issued from run() instead.
+// The disabled code is a one-time preparation that also releases unused weights tensors.
+void CLFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+    if(!_is_prepared)
+    {
+        if(!_weights_manager)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        }
+
+        auto release_unused = [](CLTensor * w)
+        {
+            if(!w->is_used())
+            {
+                CLScheduler::get().queue().finish();
+                w->allocator()->free();
+            }
+        };
+
+        // Pointer to current weights
+        const ICLTensor *cur_weights = _original_weights;
+
+        // Reshape of the weights if needed (happens only once)
+        if(!_are_weights_reshaped)
+        {
+            if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+            {
+                cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+            }
+            else
+            {
+                // Run reshape weights kernel and mark weights as unused
+                _reshape_weights_output.allocator()->allocate();
+                _reshape_weights_function.run();
+
+                cur_weights->mark_as_unused();
+                cur_weights = &_reshape_weights_output;
+            }
+            _are_weights_reshaped = true;
+        }
+
+        // Convert weights if needed (happens only once)
+        if(!_are_weights_converted)
+        {
+            if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+            {
+                _weights_manager->run(cur_weights, &_convert_weights_managed);
+            }
+            else
+            {
+                _converted_weights_output.allocator()->allocate();
+                _convert_weights.run();
+                cur_weights->mark_as_unused();
+            }
+
+            _are_weights_converted = true;
+        }
+
+        // Release reshaped weights if unused
+        release_unused(&_reshape_weights_output);
+
+        // Prepare GEMM prepare and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+        }
+
+        // Release converted weights if unused
+        release_unused(&_reshape_weights_output);
+        release_unused(&_converted_weights_output);
+
+        _is_prepared = true;
+    }
+#endif
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
index c6b166163..9aebc473e 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -16,13 +16,18 @@
 
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"
 
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h>
+
 using namespace arm_compute;
 
 void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input,
                                                const arm_compute::ICLTensor *weights,
                                                const arm_compute::ICLTensor *biases,
                                                arm_compute::ICLTensor *output, bool needs_reshape,
-                                               const arm_compute::TensorShape &reshape)
+                                               const arm_compute::TensorShape &reshape,
+                                               KernelType kernel_type)
 {
   _input = input;
   _weights = weights;
@@ -30,6 +35,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
   _output = output;
   _needs_reshape = needs_reshape;
 
+  const ICLTensor *input_to_use = input;
   if (_needs_reshape)
   {
     // reshape
@@ -37,16 +43,44 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
                        _input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
                            _input->info()->data_layout()));
     _cl_reshape.configure(_input, &_cl_buffer);
+    input_to_use = &_cl_buffer;
+  }
+
+  _cl_fc = [&]() {
+    if (kernel_type == KernelType::GENERAL)
+    {
+      auto fc = new arm_compute::CLFullyConnectedLayerEx{_memory_manager};
+      fc->configure(input_to_use, _weights, _biases, _output);
+      return std::unique_ptr<arm_compute::IFunction>(fc);
+    }
+    else
+    {
+      assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+      bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
+                        input->info()->data_type() == DataType::F16) &&
+                       weights->info()->data_type() == DataType::S8;
 
-    _cl_fc.configure(&_cl_buffer, _weights, _biases, _output);
+      if (is_hybrid)
+      {
+        auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+      else
+      {
+        auto fc = new arm_compute::CLFullyConnectedLayer{_memory_manager};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+    }
+  }();
 
+  if (_needs_reshape)
+  {
     // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
     _cl_buffer.allocator()->allocate();
   }
-  else
-  {
-    _cl_fc.configure(_input, _weights, _biases, _output);
-  }
 }
 
 void CLFullyConnectedReshapingLayer::run(void)
@@ -54,7 +88,7 @@ void CLFullyConnectedReshapingLayer::run(void)
   if (_needs_reshape)
     _cl_reshape.run();
 
-  _cl_fc.run();
+  _cl_fc->run();
 }
 
-void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); }
+void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
new file mode 100644
index 000000000..ca5499dfc
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
+
+namespace
+{
+// Returns true when the architecture derived from gpu_target is not MIDGARD and
+// reshape_b_only_on_first_run is set; false otherwise.
+inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+{
+  return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
+}
+} // namespace
+
+// Constructs the function. memory_manager is moved into the memory group; kernels and
+// helper tensors are default-constructed, both offsets start at 0 and both flags at false.
+CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0),
+      _reshape_b_only_on_first_run(false), _is_prepared(false)
+{
+}
+
+// Configures the low-precision matrix multiplication of a and b into output.
+//  a, b, output - must not be null
+//  c            - only its info (or nullptr) is forwarded to validate(); not used otherwise
+//  gemm_info    - read for reshape_b_only_on_first_run, reinterpret_input_as_3d and
+//                 depth_output_gemm3d
+// Only _mm_midgard_kernel is configured here; b is passed to it as given.
+void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b,
+                                               const ICLTensor *c, ICLTensor *output,
+                                               const GEMMInfo &gemm_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+  ARM_COMPUTE_UNUSED(c);
+  ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate(
+      a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+
+  _is_prepared = false;
+  _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+  // Remember the uniform quantization offsets of both operands
+  _a_offset = a->info()->quantization_info().uniform().offset;
+  _b_offset = b->info()->quantization_info().uniform().offset;
+
+  // Get the GPU target
+  const GPUTarget gpu_target = CLScheduler::get().target();
+
+  // Set the target for the kernels
+  _mm_midgard_kernel.set_target(gpu_target);
+
+  // GEMMRHSMatrixInfo rhs_info;
+  // GEMMLHSMatrixInfo lhs_info;
+
+  // Arguments used by GEMMReshapeInfo
+  // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m,
+  // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+  // in order to know how the matrices have been reshaped
+  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+  // m is dimension 1 of a, multiplied by dimension 2 when the input is reinterpreted as 3D
+  const unsigned int m = reinterpret_input_as_3d
+                             ? (a->info()->dimension(1) * a->info()->dimension(2))
+                             : a->info()->dimension(1);
+  const unsigned int n = b->info()->dimension(0);
+  const unsigned int k = a->info()->dimension(0);
+  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+  const ICLTensor *matrix_b = b;
+  // Configure matrix multiply kernel
+  _mm_midgard_kernel.configure(
+      a, matrix_b, output,
+      GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+}
+
+// Checks whether the function can be configured with the given tensor infos.
+// Requirements enforced here:
+//  - a must have data type S8 with one channel, and b must have the same data type as a
+//  - gemm_info must not report A or B as already reshaped
+//  - is_gemm_reshaped() must be false for the current GPU target and gemm_info
+//  - CLGEMMLowpMatrixMultiplyKernelEx::validate must accept a, a copy of b's info and output
+// c is ignored. Returns an empty Status on success, otherwise the first error found.
+Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+                                                const ITensorInfo *c, const ITensorInfo *output,
+                                                const GEMMInfo &gemm_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+  ARM_COMPUTE_UNUSED(c);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+                                  "Matrix A already reshaped is not supported");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+                                  "Matrix B already reshaped is not supported");
+
+  const ITensorInfo *matrix_a_info = a;
+
+  // Get the GPU target
+  const GPUTarget gpu_target = CLScheduler::get().target();
+
+  // Same m/n/k derivation as in configure()
+  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+  const unsigned int m =
+      reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+  const unsigned int n = b->dimension(0);
+  const unsigned int k = a->dimension(0);
+  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+  bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target);
+
+  const GEMMReshapeInfo reshape_info =
+      GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+  // Local copy of b's info that is handed to the kernel validation
+  TensorInfo weights_info(*b);
+  const ITensorInfo *matrix_b_info = &weights_info;
+  if (reshape_matrix_b)
+  {
+    // Reshaping matrix B is not implemented in this function: always report an error
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(false,
+                                    "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b");
+  }
+
+  // Validate matrix multiply
+  ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate(
+      matrix_a_info, matrix_b_info, output, reshape_info));
+
+  return Status{};
+}
+
+// Executes the function: calls prepare(), opens a memory group scope and enqueues
+// _mm_midgard_kernel on the CL scheduler.
+void CLGEMMLowpMatrixMultiplyCoreEx::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Run matrix multiply
+  // NOTE(review): the 'false' argument is presumably the scheduler's flush flag - confirm
+  CLScheduler::get().enqueue(_mm_midgard_kernel, false);
+}
+
+// Marks the function as prepared. No other preparation work is performed here; the only
+// effect is setting _is_prepared to true on the first call.
+void CLGEMMLowpMatrixMultiplyCoreEx::prepare()
+{
+  if (!_is_prepared)
+  {
+    _is_prepared = true;
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
index 6cad9bd2e..f594d7a2e 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLGatherEx.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
index 7180e9356..27ed8e828 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
 
 #include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
index 86ea5a66d..80393e8d1 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
 
 #include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
index be35ea732..28e5bc0da 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLNeg.h"
 
 #include "arm_compute/core/CL/kernels/CLNegKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
index 38adedd10..fbb15ab1d 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLPReLU.h"
 
 #include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
index 2a34c0664..6049b7e70 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
 
 #include "arm_compute/core/Helpers.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index 13a25c901..8ce2d746c 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
 
 #include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
index c03826891..1f946d37b 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
 
 #include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
index 0f455f96f..7d7b2264b 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
 
 #include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
index 80d50ad94..3ac95a8e6 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLTopKV2.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
index 40e21671d..e61746ef2 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
 #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
 
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
index 0ce3e6700..07feb5a64 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,11 +13,37 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
 
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CL/ICLTensor.h"
 
 #include <cmath>
 #include <memory>
@@ -54,7 +79,7 @@ void CLTransposeConvLayerUpsample::run()
   _output->map(CLScheduler::get().queue(), true);
   if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
   {
-    const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+    const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset;
     std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
   }
   else
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
index a95018a28..5405934ad 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
@@ -1,6 +1,21 @@
 /*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,23 +37,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h"
 
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+#include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h"
+
+#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
+using namespace arm_compute;
 
-namespace arm_compute
-{
-void NENegLayer::configure(const ITensor *input, ITensor *output)
+void CPPOneHotEx::configure(const ITensor *indices, ITensor *output, const int depth,
+                            const float on_value, const float off_value, const int axis)
 {
-  auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>();
-  k->configure(ElementWiseUnaryEx::NEG, input, output);
+  auto k = arm_compute::support::cpp14::make_unique<CPPOneHotKernelEx>();
+  k->configure(indices, output, depth, on_value, off_value, axis);
   _kernel = std::move(k);
 }
-Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-  return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
index f8e0ef8a6..6c90ef3b4 100644
--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
 
 #include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
new file mode 100644
index 000000000..ff81ff854
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEActivationLayerEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT
+    : INESimpleFunctionNoBorder(ctx)
+{
+}
+void NEActivationLayerEx::configure(ITensor *input, ITensor *output,
+                                    ActivationLayerInfo activation_info)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernelEx>();
+  k->configure(input, output, activation_info);
+  _kernel = std::move(k);
+}
+
+Status NEActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                     const ActivationLayerInfo &act_info)
+{
+  return NEActivationLayerKernelEx::validate(input, output, act_info);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
deleted file mode 100644
index 5ba465b61..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-
-template <ReductionOperation OP>
-NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape()
-{
-}
-
-template <ReductionOperation OP>
-Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis,
-                                       const ITensorInfo *output)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32);
-
-  TensorShape out_shape = input->tensor_shape();
-  const int input_dims = input->num_dimensions();
-  int axis_local = axis;
-
-  // Convert negative axis
-  axis_local = wrap_around(axis_local, input_dims);
-
-  ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3);
-  ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1);
-  out_shape.remove_dimension(axis_local);
-
-  const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
-  return Status{};
-}
-
-template <ReductionOperation OP>
-void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
-  int axis_local = axis;
-  const int input_dims = input->info()->num_dimensions();
-
-  // Convert negative axis
-  axis_local = wrap_around(axis_local, input_dims);
-
-  // Perform reduction for axis
-  TensorShape intermediate_shape = input->info()->tensor_shape();
-  intermediate_shape.set(axis_local, 1);
-  auto in = input;
-
-  _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(),
-                                            output->info()->data_type(),
-                                            output->info()->quantization_info()));
-  _memory_group.manage(&_reduced_out);
-  _reduction_kernel.configure(in, axis_local, &_reduced_out, OP);
-
-  // Allocate intermediate tensor
-  _reduced_out.allocator()->allocate();
-
-  // Configure reshape layer if we want to drop the dimensions
-  TensorShape out_shape = input->info()->tensor_shape();
-  out_shape.remove_dimension(axis_local);
-  auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape));
-  _reshape.configure(&_reduced_out, output);
-}
-
-template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run()
-{
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  _reduction_kernel.run();
-  _reshape.run();
-}
-
-// Supported Specializations
-template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>;
-template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>;
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
index 7c15fc453..e42c453cf 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
 #include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
 
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
index f2490e4e8..dc5c62061 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
index db419e3a8..5ec0b8677 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
index 00c3ed94f..53fb15081 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
 
 #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
index d604fedbf..f45773251 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"
 
 #include "arm_compute/core/Helpers.h"
@@ -154,7 +170,7 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
 
   // Multiply scale
   _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
-                                   weights->info()->quantization_info().scale);
+                                   weights->info()->quantization_info().uniform().scale);
 
   _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
 
@@ -220,7 +236,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
   ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
 
   ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
-      &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale));
+      &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale));
 
   return Status{};
 }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
index a944f699a..cb7557a5a 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
 
 #include "arm_compute/core/Helpers.h"
@@ -46,10 +62,10 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
     // Since we need negative offsets for computing convolution, we need to change
     // QuantizationInfo()
     // Extract and negate input and weights offset
-    const QuantizationInfo input_quantization_info(input.quantization_info().scale,
-                                                   -input.quantization_info().offset);
-    const QuantizationInfo weights_quantization_info(weights.quantization_info().scale,
-                                                     -weights.quantization_info().offset);
+    const QuantizationInfo input_quantization_info(input.quantization_info().uniform().scale,
+                                                   -input.quantization_info().uniform().offset);
+    const QuantizationInfo weights_quantization_info(weights.quantization_info().uniform().scale,
+                                                     -weights.quantization_info().uniform().offset);
 
     // Validate gemmlowp function
     ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
@@ -88,10 +104,10 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *
     const QuantizationInfo input_quantization_info = input->info()->quantization_info();
     const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
 
-    input->info()->set_quantization_info(
-        QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
-    weights->info()->set_quantization_info(
-        QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+    input->info()->set_quantization_info(QuantizationInfo(
+        input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+    weights->info()->set_quantization_info(QuantizationInfo(
+        weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
     // Configure gemmlowp function
     _mm_gemmlowp.configure(input, weights, nullptr, output);
@@ -236,15 +252,16 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
   // Configure output stage for asymmetric quantized types
   if (_is_quantized)
   {
-    float multiplier = input->info()->quantization_info().scale *
-                       weights->info()->quantization_info().scale /
-                       output->info()->quantization_info().scale;
+    float multiplier = input->info()->quantization_info().uniform().scale *
+                       weights->info()->quantization_info().uniform().scale /
+                       output->info()->quantization_info().uniform().scale;
     int output_multiplier;
     int output_shift;
     quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
                                                                &output_shift);
     _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
-                                     output_shift, output->info()->quantization_info().offset);
+                                     output_shift,
+                                     output->info()->quantization_info().uniform().offset);
     _gemmlowp_output.allocator()->allocate();
   }
 
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
index 11794a1ea..1290cfd39 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
 
 #include "arm_compute/core/Error.h"
@@ -50,7 +66,7 @@ NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
       _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
       _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
       _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
-      _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
+      _fuse_output_stage(false), _flip_signedness(false)
 {
 }
 
@@ -71,8 +87,8 @@ void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *
   _mtx_b_reshape_kernel = nullptr;
 
   // Set internal variables
-  _a_offset = a->info()->quantization_info().offset;
-  _b_offset = b->info()->quantization_info().offset;
+  _a_offset = a->info()->quantization_info().uniform().offset;
+  _b_offset = b->info()->quantization_info().uniform().offset;
   _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
   _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
   _is_prepared = false;
@@ -91,7 +107,6 @@ void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *
   }
 
 #ifdef __aarch64__
-#if 0  // Can use after arm compute library v19.11
   switch (a->info()->data_type())
   {
     case DataType::QASYMM8:
@@ -119,8 +134,6 @@ void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *
       break;
     }
   }
-#endif // 0
-  ARM_COMPUTE_ERROR("aarch64 not supported");
 #endif /* __aarch64__ */
   if (!(_assembly_path || _run_vector_matrix_multiplication))
   {
@@ -277,8 +290,8 @@ Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITen
   TensorInfo tmp_b_info{};
   TensorInfo mm_result_s32_info{};
 
-  int32_t a_offset = a->quantization_info().offset;
-  int32_t b_offset = b->quantization_info().offset;
+  int32_t a_offset = a->quantization_info().uniform().offset;
+  int32_t b_offset = b->quantization_info().uniform().offset;
 
   bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
   if (fuse_output_stage)
@@ -291,19 +304,16 @@ Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITen
   // Check if we need to run the optimized assembly kernel
   bool run_optimised = false;
   bool run_optimised_requantized = false;
-  const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
   if (a_to_use->data_type() == DataType::QASYMM8 &&
       info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
   {
-    run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f,
-                                                          reshape_b_only_on_first_run));
+    run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
     run_optimised_requantized = run_optimised;
   }
   else
   {
     run_optimised = bool(NEGEMMAssemblyDispatch::validate(
-        a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f,
-        reshape_b_only_on_first_run));
+        a_to_use, b, c, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
   }
 
   if (run_optimised)
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
index 90dabb35a..c8bb88aea 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
 
 #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
index 624185d2c..078019f4e 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -1,6 +1,5 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2016-2018 ARM Limited.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +13,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
 
 #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
index 1c2c8f027..16d74e62d 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
 
 #include "arm_compute/core/Helpers.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
index 1150cef76..dac3b849d 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEPReLU.h"
 
 #include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
index 84411c266..0e9a5e969 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
index c65e93570..116bba3c0 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
 
 #include "arm_compute/core/Helpers.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
index b36f8287a..aedb537e9 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,11 +37,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
 
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
 
 using namespace arm_compute;
 
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
index 3c18217ef..26a887912 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
 
 #include "arm_compute/core/CPP/Validate.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
index c3431c418..2aa0d2d4b 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
 
 #include "arm_compute/core/Helpers.h"
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
index c9f914fb0..198bb7672 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
index b6ae21cc0..97697e3ea 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
index fd15ef05f..df0689273 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -1,5 +1,20 @@
 /*
  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
  * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
@@ -22,6 +37,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
 
 #include "arm_compute/core/Helpers.h"
author	Chunseok Lee <chunseok.lee@samsung.com>	2020-04-23 14:45:49 +0900
committer	Chunseok Lee <chunseok.lee@samsung.com>	2020-04-23 14:45:49 +0900
commit	e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e (patch)
tree	44a1a7951d168dd4370e13593ed03f4bc6d920c5 /compute/ARMComputeEx/src
parent	302e6564a7a76109e1178207e44e45a58631c477 (diff)
download	nnfw-e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e.tar.gz nnfw-e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e.tar.bz2 nnfw-e2ef8438a24f7c56a0744eb579a6e293ee2fbf8e.zip