author    Anthony Barbier <Anthony.barbier@arm.com> 2017-12-14 23:48:46 +0000
committer Anthony Barbier <anthony.barbier@arm.com> 2018-01-24 10:01:21 +0000
commit    8140e1e155d3430992fa46e04ef8938ff09ffd2d (patch)
tree      9bcf86d01635bfc73e8debd1bda75e6f75b8b406 /arm_compute
parent    8a3da6f91f90c566b844d568f4ec43b946915af8 (diff)
arm_compute v17.12
Diffstat (limited to 'arm_compute')
-rw-r--r--arm_compute/core/CL/CLHelpers.h9
-rw-r--r--arm_compute/core/CL/CLKernelLibrary.h50
-rw-r--r--arm_compute/core/CL/CLKernels.h12
-rw-r--r--arm_compute/core/CL/ICLKernel.h7
-rw-r--r--arm_compute/core/CL/ICLMultiHOG.h4
-rw-r--r--arm_compute/core/CL/ICLMultiImage.h4
-rw-r--r--arm_compute/core/CL/OpenCL.h97
-rw-r--r--arm_compute/core/CL/kernels/CLActivationLayerKernel.h10
-rw-r--r--arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h10
-rw-r--r--arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h10
-rw-r--r--arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h22
-rw-r--r--arm_compute/core/CL/kernels/CLChannelExtractKernel.h4
-rw-r--r--arm_compute/core/CL/kernels/CLCol2ImKernel.h2
-rw-r--r--arm_compute/core/CL/kernels/CLColorConvertKernel.h17
-rw-r--r--arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h (renamed from arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h)14
-rw-r--r--arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h (renamed from arm_compute/core/CL/kernels/CLDepthConvertKernel.h)2
-rw-r--r--arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h (renamed from arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h)25
-rw-r--r--arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h3
-rw-r--r--arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h4
-rw-r--r--arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h27
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h2
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h31
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h82
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h96
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h97
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h99
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h2
-rw-r--r--arm_compute/core/CL/kernels/CLIm2ColKernel.h2
-rw-r--r--arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h (renamed from arm_compute/core/CL/kernels/CLL2NormalizeKernel.h)14
-rw-r--r--arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h11
-rw-r--r--arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h14
-rw-r--r--arm_compute/core/CL/kernels/CLPoolingLayerKernel.h13
-rw-r--r--arm_compute/core/CL/kernels/CLReshapeLayerKernel.h2
-rw-r--r--arm_compute/core/CL/kernels/CLScaleKernel.h3
-rw-r--r--arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h111
-rw-r--r--arm_compute/core/CL/kernels/CLTransposeKernel.h10
-rw-r--r--arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h3
-rw-r--r--arm_compute/core/CPP/CPPKernels.h1
-rw-r--r--arm_compute/core/CPP/kernels/CPPPermuteKernel.h93
-rw-r--r--arm_compute/core/Dimensions.h39
-rw-r--r--arm_compute/core/Error.h269
-rw-r--r--arm_compute/core/FixedPoint.h126
-rw-r--r--arm_compute/core/GLES_COMPUTE/GCHelpers.h64
-rw-r--r--arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h307
-rw-r--r--arm_compute/core/GLES_COMPUTE/GCKernels.h48
-rw-r--r--arm_compute/core/GLES_COMPUTE/IGCKernel.h179
-rw-r--r--arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h41
-rw-r--r--arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h43
-rw-r--r--arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h66
-rw-r--r--arm_compute/core/GLES_COMPUTE/IGCTensor.h99
-rw-r--r--arm_compute/core/GLES_COMPUTE/OpenGLES.h165
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h71
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h68
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h77
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h92
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h76
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h87
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h79
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h77
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h80
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h64
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h70
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h79
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h67
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h109
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h72
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h70
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h70
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h109
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h52
-rw-r--r--arm_compute/core/Helpers.h110
-rw-r--r--arm_compute/core/Helpers.inl34
-rw-r--r--arm_compute/core/IArray.h2
-rw-r--r--arm_compute/core/IMultiHOG.h4
-rw-r--r--arm_compute/core/IMultiImage.h4
-rw-r--r--arm_compute/core/ITensorInfo.h45
-rw-r--r--arm_compute/core/Log.h113
-rw-r--r--arm_compute/core/NEON/NEAsymm.h61
-rw-r--r--arm_compute/core/NEON/NEAsymm.inl67
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.h20
-rw-r--r--arm_compute/core/NEON/NEKernels.h22
-rw-r--r--arm_compute/core/NEON/NEMath.h4
-rw-r--r--arm_compute/core/NEON/NEMath.inl4
-rw-r--r--arm_compute/core/NEON/kernels/NEAccumulateKernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/NEActivationLayerKernel.h33
-rw-r--r--arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h10
-rw-r--r--arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h10
-rw-r--r--arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h22
-rw-r--r--arm_compute/core/NEON/kernels/NEBox3x3Kernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/NECannyEdgeKernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/NECol2ImKernel.h18
-rw-r--r--arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h72
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h (renamed from arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h)14
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h (renamed from arm_compute/core/NEON/kernels/NEDepthConvertKernel.h)12
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h69
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h74
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h71
-rw-r--r--arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h67
-rw-r--r--arm_compute/core/NEON/kernels/NEDerivativeKernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h10
-rw-r--r--arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h14
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h12
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h75
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h40
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h99
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h116
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h116
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h127
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h63
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h10
-rw-r--r--arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h17
-rw-r--r--arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/NEHistogramKernel.h20
-rw-r--r--arm_compute/core/NEON/kernels/NEIm2ColKernel.h16
-rw-r--r--arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h (renamed from arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h)14
-rw-r--r--arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h18
-rw-r--r--arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h12
-rw-r--r--arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h17
-rw-r--r--arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h25
-rw-r--r--arm_compute/core/NEON/kernels/NERemapKernel.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEScaleKernel.h4
-rw-r--r--arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h34
-rw-r--r--arm_compute/core/NEON/kernels/NETransposeKernel.h10
-rw-r--r--arm_compute/core/NEON/kernels/NEWarpKernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h97
-rw-r--r--arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h55
-rw-r--r--arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h56
-rw-r--r--arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h58
-rw-r--r--arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h44
-rw-r--r--arm_compute/core/NEON/kernels/assembly/gemm_common.hpp2
-rw-r--r--arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp12
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp68
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp313
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp61
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp398
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h66
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp363
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp61
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp465
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp68
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp314
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp65
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp396
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h66
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp354
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp61
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp465
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp62
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp337
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp9
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp5
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp368
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp360
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp4
-rw-r--r--arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp2
-rw-r--r--arm_compute/core/NEON/kernels/assembly/profiler.hpp18
-rw-r--r--arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp165
-rw-r--r--arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp130
-rw-r--r--arm_compute/core/NEON/kernels/assembly/transforms/list.hpp4
-rw-r--r--arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h172
-rw-r--r--arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h503
-rw-r--r--arm_compute/core/NEON/kernels/winograd/alloc.hpp30
-rw-r--r--arm_compute/core/NEON/kernels/winograd/tensor.hpp210
-rw-r--r--arm_compute/core/QAsymm8.h33
-rw-r--r--arm_compute/core/QAsymm8.inl41
-rw-r--r--arm_compute/core/Rounding.h46
-rw-r--r--arm_compute/core/Strides.h3
-rw-r--r--arm_compute/core/SubTensorInfo.h52
-rw-r--r--arm_compute/core/TensorInfo.h58
-rw-r--r--arm_compute/core/TensorShape.h2
-rw-r--r--arm_compute/core/Types.h286
-rw-r--r--arm_compute/core/Utils.h150
-rw-r--r--arm_compute/core/Validate.h920
-rw-r--r--arm_compute/core/utils/io/FileHandler.h (renamed from arm_compute/core/Logger.h)75
-rw-r--r--arm_compute/core/utils/logging/FilePrinter.h54
-rw-r--r--arm_compute/core/utils/logging/Helpers.h77
-rw-r--r--arm_compute/core/utils/logging/IPrinter.h74
-rw-r--r--arm_compute/core/utils/logging/LogMsgDecorators.h139
-rw-r--r--arm_compute/core/utils/logging/Logger.h176
-rw-r--r--arm_compute/core/utils/logging/LoggerRegistry.h90
-rw-r--r--arm_compute/core/utils/logging/Macros.h71
-rw-r--r--arm_compute/core/utils/logging/Printers.h31
-rw-r--r--arm_compute/core/utils/logging/StdPrinter.h47
-rw-r--r--arm_compute/core/utils/logging/Types.h58
-rw-r--r--arm_compute/core/utils/misc/ICloneable.h48
-rw-r--r--arm_compute/core/utils/misc/utility.h72
-rw-r--r--arm_compute/core/utils/quantization/AsymmHelpers.h53
-rw-r--r--arm_compute/graph/CL/CLMap.h10
-rw-r--r--arm_compute/graph/CL/CLUnmap.h8
-rw-r--r--arm_compute/graph/Error.h64
-rw-r--r--arm_compute/graph/Graph.h15
-rw-r--r--arm_compute/graph/INode.h3
-rw-r--r--arm_compute/graph/IOperation.h70
-rw-r--r--arm_compute/graph/ITensorObject.h76
-rw-r--r--arm_compute/graph/NodeContext.h142
-rw-r--r--arm_compute/graph/NodeParameter.h74
-rw-r--r--arm_compute/graph/Nodes.h7
-rw-r--r--arm_compute/graph/OperationRegistrar.h59
-rw-r--r--arm_compute/graph/OperationRegistry.h86
-rw-r--r--arm_compute/graph/SubGraph.h94
-rw-r--r--arm_compute/graph/SubTensor.h43
-rw-r--r--arm_compute/graph/Tensor.h44
-rw-r--r--arm_compute/graph/Types.h70
-rw-r--r--arm_compute/graph/nodes/ActivationLayer.h4
-rw-r--r--arm_compute/graph/nodes/BatchNormalizationLayer.h3
-rw-r--r--arm_compute/graph/nodes/BranchLayer.h86
-rw-r--r--arm_compute/graph/nodes/ConvolutionLayer.h3
-rw-r--r--arm_compute/graph/nodes/DepthConvertLayer.h58
-rw-r--r--arm_compute/graph/nodes/DepthwiseConvolutionLayer.h73
-rw-r--r--arm_compute/graph/nodes/DequantizationLayer.h59
-rw-r--r--arm_compute/graph/nodes/FlattenLayer.h45
-rw-r--r--arm_compute/graph/nodes/FloorLayer.h6
-rw-r--r--arm_compute/graph/nodes/FullyConnectedLayer.h3
-rw-r--r--arm_compute/graph/nodes/L2NormalizeLayer.h11
-rw-r--r--arm_compute/graph/nodes/NormalizationLayer.h3
-rw-r--r--arm_compute/graph/nodes/PoolingLayer.h4
-rw-r--r--arm_compute/graph/nodes/QuantizationLayer.h45
-rw-r--r--arm_compute/graph/nodes/ReshapeLayer.h54
-rw-r--r--arm_compute/graph/nodes/SoftmaxLayer.h7
-rw-r--r--arm_compute/runtime/BlobLifetimeManager.h41
-rw-r--r--arm_compute/runtime/BlobMemoryPool.h2
-rw-r--r--arm_compute/runtime/CL/CLFunctions.h11
-rw-r--r--arm_compute/runtime/CL/CLMultiImage.h20
-rw-r--r--arm_compute/runtime/CL/functions/CLActivationLayer.h10
-rw-r--r--arm_compute/runtime/CL/functions/CLArithmeticAddition.h10
-rw-r--r--arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h10
-rw-r--r--arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h22
-rw-r--r--arm_compute/runtime/CL/functions/CLChannelExtract.h4
-rw-r--r--arm_compute/runtime/CL/functions/CLColorConvert.h17
-rw-r--r--arm_compute/runtime/CL/functions/CLConvolutionLayer.h64
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h (renamed from arm_compute/runtime/CL/functions/CLDepthConcatenate.h)16
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthConvertLayer.h (renamed from arm_compute/runtime/CL/functions/CLDepthConvert.h)4
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h (renamed from arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h)30
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h19
-rw-r--r--arm_compute/runtime/CL/functions/CLDilate.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h19
-rw-r--r--arm_compute/runtime/CL/functions/CLErode.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h35
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h4
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowp.h89
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h95
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h151
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLHOGDetector.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLIntegralImage.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h (renamed from arm_compute/runtime/CL/functions/CLL2Normalize.h)14
-rw-r--r--arm_compute/runtime/CL/functions/CLLaplacianPyramid.h4
-rw-r--r--arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLMagnitude.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLNormalizationLayer.h16
-rw-r--r--arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h23
-rw-r--r--arm_compute/runtime/CL/functions/CLPoolingLayer.h12
-rw-r--r--arm_compute/runtime/CL/functions/CLReshapeLayer.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLScale.h4
-rw-r--r--arm_compute/runtime/CL/functions/CLSoftmaxLayer.h31
-rw-r--r--arm_compute/runtime/CL/functions/CLTranspose.h10
-rw-r--r--arm_compute/runtime/CPP/CPPFunctions.h30
-rw-r--r--arm_compute/runtime/CPP/ICPPSimpleFunction.h48
-rw-r--r--arm_compute/runtime/CPP/functions/CPPPermute.h57
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/GCFunctions.h45
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/GCScheduler.h73
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/GCTensor.h100
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h128
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h50
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h52
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h53
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h67
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h67
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h59
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h63
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h52
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h96
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h85
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h50
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h47
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h71
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h48
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h53
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h70
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h50
-rw-r--r--arm_compute/runtime/ILifetimeManager.h14
-rw-r--r--arm_compute/runtime/ISimpleLifetimeManager.h85
-rw-r--r--arm_compute/runtime/Memory.h83
-rw-r--r--arm_compute/runtime/MemoryGroupBase.h1
-rw-r--r--arm_compute/runtime/MultiImage.h28
-rw-r--r--arm_compute/runtime/NEON/NEFunctions.h17
-rw-r--r--arm_compute/runtime/NEON/functions/NEActivationLayer.h10
-rw-r--r--arm_compute/runtime/NEON/functions/NEArithmeticAddition.h10
-rw-r--r--arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h10
-rw-r--r--arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h22
-rw-r--r--arm_compute/runtime/NEON/functions/NEChannelExtract.h4
-rw-r--r--arm_compute/runtime/NEON/functions/NECol2Im.h60
-rw-r--r--arm_compute/runtime/NEON/functions/NEColorConvert.h17
-rw-r--r--arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h96
-rw-r--r--arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h72
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h (renamed from arm_compute/runtime/NEON/functions/NEDepthConcatenate.h)16
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h (renamed from arm_compute/runtime/NEON/functions/NEDepthConvert.h)10
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h113
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h79
-rw-r--r--arm_compute/runtime/NEON/functions/NEDilate.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h53
-rw-r--r--arm_compute/runtime/NEON/functions/NEErode.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowp.h90
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h69
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h110
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h151
-rw-r--r--arm_compute/runtime/NEON/functions/NEGaussianPyramid.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEIm2Col.h66
-rw-r--r--arm_compute/runtime/NEON/functions/NEIntegralImage.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h (renamed from arm_compute/runtime/NEON/functions/NEL2Normalize.h)16
-rw-r--r--arm_compute/runtime/NEON/functions/NELaplacianPyramid.h4
-rw-r--r--arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEMagnitude.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NENormalizationLayer.h16
-rw-r--r--arm_compute/runtime/NEON/functions/NEPhase.h9
-rw-r--r--arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NEPoolingLayer.h11
-rw-r--r--arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEScale.h4
-rw-r--r--arm_compute/runtime/NEON/functions/NESoftmaxLayer.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NETranspose.h10
-rw-r--r--arm_compute/runtime/NEON/functions/NEWinogradLayer.h82
-rw-r--r--arm_compute/runtime/OffsetLifetimeManager.h67
-rw-r--r--arm_compute/runtime/OffsetMemoryPool.h72
-rw-r--r--arm_compute/runtime/TensorAllocator.h16
327 files changed, 18642 insertions, 1411 deletions
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index 1a4476e30..365ecb06c 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -43,7 +43,7 @@ struct enable_bitwise_ops<arm_compute::GPUTarget>
};
/** Max vector width of an OpenCL vector */
-static constexpr const unsigned int max_cl_vector_width = 16;
+static constexpr unsigned int max_cl_vector_width = 16;
/** Translates a tensor data type to the appropriate OpenCL type.
*
@@ -126,6 +126,13 @@ GPUTarget get_arch_from_target(GPUTarget target);
* @return the highest OpenCL version supported
*/
CLVersion get_cl_version(const cl::Device &device);
+/** Helper function to check whether the cl_khr_fp16 extension is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the extension is supported
+ */
+bool fp16_support(const cl::Device &device);
/** Helper function to check whether the arm_non_uniform_work_group_size extension is supported
*
* @param[in] device A CL device
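A minimal sketch (not part of this patch) of how the new fp16_support() helper might be used to gate half-precision kernel selection; the wrapper function name is illustrative only:

#include "arm_compute/core/CL/CLHelpers.h"

// Hypothetical helper: decide whether F16 kernel variants can be used on
// the given device by querying the cl_khr_fp16 extension.
bool use_f16_kernels(const cl::Device &device)
{
    return arm_compute::fp16_support(device); // true if cl_khr_fp16 is available
}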
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index fc131cdcf..25c7f75ba 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -33,6 +33,52 @@
namespace arm_compute
{
+/** Build options */
+class CLBuildOptions
+{
+ using StringSet = std::set<std::string>;
+
+public:
+ /** Default constructor. */
+ CLBuildOptions();
+ /** Adds option to the existing build option list
+ *
+ * @param[in] option Option to add
+ */
+ void add_option(std::string option);
+ /** Adds option if a given condition is true.
+ *
+ * @param[in] cond Condition to check
+ * @param[in] option Option to add if condition is true
+ */
+ void add_option_if(bool cond, std::string option);
+ /** Adds the first option if the condition is true, else the second one.
+ *
+ * @param[in] cond Condition to check
+ * @param[in] option_true Option to add if condition is true
+ * @param[in] option_false Option to add if condition is false
+ */
+ void add_option_if_else(bool cond, std::string option_true, std::string option_false);
+ /** Appends the given build options to the current object's options.
+ *
+ * @param[in] options Build options to append
+ */
+ void add_options(const StringSet &options);
+ /** Appends the given build options to the current object's options if a given condition is true.
+ *
+ * @param[in] cond Condition to check
+ * @param[in] options Options to add if condition is true
+ */
+ void add_options_if(bool cond, const StringSet &options);
+ /** Gets the current options list set
+ *
+ * @return Build options set
+ */
+ const StringSet &options() const;
+
+private:
+ StringSet _build_opts; /**< Build options set */
+};
/** Program class */
class Program
{
@@ -181,8 +227,8 @@ public:
return _kernel_path;
};
/** Gets the source of the selected program
- *
- * @param[in] program_name Program name.
+ *
+ * @param[in] program_name Program name.
*/
std::string get_program_source(const std::string &program_name);
/** Sets the CL context used to create programs.
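A usage sketch (not from the patch) for the CLBuildOptions class introduced above; the -D defines and boolean flags are made-up examples:

#include "arm_compute/core/CL/CLKernelLibrary.h"

#include <set>
#include <string>

// Hypothetical example: accumulate kernel build options conditionally.
std::set<std::string> make_build_options(bool has_fp16, bool fast_math)
{
    arm_compute::CLBuildOptions build_opts;
    build_opts.add_option("-DDATA_TYPE=float");               // unconditional option
    build_opts.add_option_if(has_fp16, "-DARM_COMPUTE_FP16"); // added only when has_fp16 is true
    build_opts.add_option_if_else(fast_math,
                                  "-cl-fast-relaxed-math",    // added when fast_math is true
                                  "-cl-opt-disable");         // added otherwise
    return build_opts.options();                              // the accumulated option set
}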
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 8da0cecad..9da0e5ab3 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -42,9 +42,9 @@
#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
@@ -58,6 +58,10 @@
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
@@ -72,7 +76,7 @@
#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
-#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index 9119940bc..a1bc3eb8d 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -180,6 +180,13 @@ public:
* @return The maximum workgroup size value.
*/
size_t get_max_workgroup_size();
+ /** Get the global work size given an execution window
+ *
+ * @param[in] window Execution window
+ *
+ * @return Global work size of the given execution window
+ */
+ static cl::NDRange gws_from_window(const Window &window);
private:
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
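A sketch (an assumption, not code from the patch) of how a caller could use the new static gws_from_window() helper to derive the global work size when enqueuing a kernel:

#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/Window.h"

// Hypothetical enqueue helper: translate an execution window into an
// OpenCL global work size and launch with the default local size.
void enqueue(cl::CommandQueue &queue, cl::Kernel &kernel, const arm_compute::Window &window)
{
    const cl::NDRange gws = arm_compute::ICLKernel::gws_from_window(window);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, gws, cl::NullRange);
}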
diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h
index 9f3c77523..90082a611 100644
--- a/arm_compute/core/CL/ICLMultiHOG.h
+++ b/arm_compute/core/CL/ICLMultiHOG.h
@@ -35,14 +35,14 @@ class ICLMultiHOG : public IMultiHOG
public:
/** Return a pointer to the requested OpenCL HOG model
*
- * @param[in] index The index of the wanted OpenCL HOG model.
+ * @param[in] index The index of the wanted OpenCL HOG model.
*
* @return A pointer to the HOG model
*/
virtual ICLHOG *cl_model(size_t index) = 0;
/** Return a constant pointer to the requested OpenCL HOG model
*
- * @param[in] index The index of the wanted OpenCL HOG model.
+ * @param[in] index The index of the wanted OpenCL HOG model.
*
* @return A constant pointer to the OpenCL HOG model
*/
diff --git a/arm_compute/core/CL/ICLMultiImage.h b/arm_compute/core/CL/ICLMultiImage.h
index e8705b182..774175607 100644
--- a/arm_compute/core/CL/ICLMultiImage.h
+++ b/arm_compute/core/CL/ICLMultiImage.h
@@ -37,14 +37,14 @@ class ICLMultiImage : public IMultiImage
public:
/** Return a pointer to the requested OpenCL plane of the image.
*
- * @param[in] index The index of the wanted plane.
+ * @param[in] index The index of the wanted plane.
*
* @return A pointer to the OpenCL plane
*/
virtual ICLImage *cl_plane(unsigned int index) = 0;
/** Return a constant pointer to the requested OpenCL plane of the image.
*
- * @param[in] index The index of the wanted plane.
+ * @param[in] index The index of the wanted plane.
*
* @return A constant pointer to the OpenCL plane
*/
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index 6780e23c2..8a2d30bb8 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -54,69 +54,42 @@ public:
bool load(const std::string &library);
bool load_default();
- using clBuildProgram_func = cl_int (*)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program, void *), void *);
- using clEnqueueNDRangeKernel_func = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
- using clSetKernelArg_func = cl_int (*)(cl_kernel, cl_uint, size_t, const void *);
- using clRetainMemObject_func = cl_int (*)(cl_mem);
- using clReleaseMemObject_func = cl_int (*)(cl_mem);
- using clEnqueueUnmapMemObject_func = cl_int (*)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
- using clRetainCommandQueue_func = cl_int (*)(cl_command_queue command_queue);
- using clReleaseContext_func = cl_int (*)(cl_context);
- using clReleaseEvent_func = cl_int (*)(cl_event);
- using clEnqueueWriteBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
- using clEnqueueReadBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
- using clGetProgramBuildInfo_func = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
- using clRetainProgram_func = cl_int (*)(cl_program program);
- using clEnqueueMapBuffer_func = void *(*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
- using clReleaseCommandQueue_func = cl_int (*)(cl_command_queue);
- using clCreateProgramWithBinary_func = cl_program (*)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
- using clRetainContext_func = cl_int (*)(cl_context context);
- using clReleaseProgram_func = cl_int (*)(cl_program program);
- using clFlush_func = cl_int (*)(cl_command_queue command_queue);
- using clFinish_func = cl_int (*)(cl_command_queue command_queue);
- using clGetProgramInfo_func = cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *);
- using clCreateKernel_func = cl_kernel (*)(cl_program, const char *, cl_int *);
- using clRetainKernel_func = cl_int (*)(cl_kernel kernel);
- using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
- using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
- using clReleaseKernel_func = cl_int (*)(cl_kernel kernel);
- using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *);
- using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
- using clRetainEvent_func = cl_int (*)(cl_event);
- using clGetPlatformIDs_func = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *);
- using clGetKernelWorkGroupInfo_func = cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
+#define DECLARE_FUNCTION_PTR(func_name) \
+ std::function<decltype(func_name)> func_name##_ptr = nullptr
- clBuildProgram_func clBuildProgram = nullptr;
- clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr;
- clSetKernelArg_func clSetKernelArg = nullptr;
- clReleaseKernel_func clReleaseKernel = nullptr;
- clCreateProgramWithSource_func clCreateProgramWithSource = nullptr;
- clCreateBuffer_func clCreateBuffer = nullptr;
- clRetainKernel_func clRetainKernel = nullptr;
- clCreateKernel_func clCreateKernel = nullptr;
- clGetProgramInfo_func clGetProgramInfo = nullptr;
- clFlush_func clFlush = nullptr;
- clFinish_func clFinish = nullptr;
- clReleaseProgram_func clReleaseProgram = nullptr;
- clRetainContext_func clRetainContext = nullptr;
- clCreateProgramWithBinary_func clCreateProgramWithBinary = nullptr;
- clReleaseCommandQueue_func clReleaseCommandQueue = nullptr;
- clEnqueueMapBuffer_func clEnqueueMapBuffer = nullptr;
- clRetainProgram_func clRetainProgram = nullptr;
- clGetProgramBuildInfo_func clGetProgramBuildInfo = nullptr;
- clEnqueueReadBuffer_func clEnqueueReadBuffer = nullptr;
- clEnqueueWriteBuffer_func clEnqueueWriteBuffer = nullptr;
- clReleaseEvent_func clReleaseEvent = nullptr;
- clReleaseContext_func clReleaseContext = nullptr;
- clRetainCommandQueue_func clRetainCommandQueue = nullptr;
- clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr;
- clRetainMemObject_func clRetainMemObject = nullptr;
- clReleaseMemObject_func clReleaseMemObject = nullptr;
- clGetDeviceInfo_func clGetDeviceInfo = nullptr;
- clGetDeviceIDs_func clGetDeviceIDs = nullptr;
- clRetainEvent_func clRetainEvent = nullptr;
- clGetPlatformIDs_func clGetPlatformIDs = nullptr;
- clGetKernelWorkGroupInfo_func clGetKernelWorkGroupInfo = nullptr;
+ DECLARE_FUNCTION_PTR(clBuildProgram);
+ DECLARE_FUNCTION_PTR(clEnqueueNDRangeKernel);
+ DECLARE_FUNCTION_PTR(clSetKernelArg);
+ DECLARE_FUNCTION_PTR(clReleaseKernel);
+ DECLARE_FUNCTION_PTR(clCreateProgramWithSource);
+ DECLARE_FUNCTION_PTR(clCreateBuffer);
+ DECLARE_FUNCTION_PTR(clRetainKernel);
+ DECLARE_FUNCTION_PTR(clCreateKernel);
+ DECLARE_FUNCTION_PTR(clGetProgramInfo);
+ DECLARE_FUNCTION_PTR(clFlush);
+ DECLARE_FUNCTION_PTR(clFinish);
+ DECLARE_FUNCTION_PTR(clReleaseProgram);
+ DECLARE_FUNCTION_PTR(clRetainContext);
+ DECLARE_FUNCTION_PTR(clCreateProgramWithBinary);
+ DECLARE_FUNCTION_PTR(clReleaseCommandQueue);
+ DECLARE_FUNCTION_PTR(clEnqueueMapBuffer);
+ DECLARE_FUNCTION_PTR(clRetainProgram);
+ DECLARE_FUNCTION_PTR(clGetProgramBuildInfo);
+ DECLARE_FUNCTION_PTR(clEnqueueReadBuffer);
+ DECLARE_FUNCTION_PTR(clEnqueueWriteBuffer);
+ DECLARE_FUNCTION_PTR(clReleaseEvent);
+ DECLARE_FUNCTION_PTR(clReleaseContext);
+ DECLARE_FUNCTION_PTR(clRetainCommandQueue);
+ DECLARE_FUNCTION_PTR(clEnqueueUnmapMemObject);
+ DECLARE_FUNCTION_PTR(clRetainMemObject);
+ DECLARE_FUNCTION_PTR(clReleaseMemObject);
+ DECLARE_FUNCTION_PTR(clGetDeviceInfo);
+ DECLARE_FUNCTION_PTR(clGetDeviceIDs);
+ DECLARE_FUNCTION_PTR(clRetainEvent);
+ DECLARE_FUNCTION_PTR(clGetPlatformIDs);
+ DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo);
+
+#undef DECLARE_FUNCTION_PTR
private:
std::pair<bool, bool> _loaded{ false, false };
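For clarity, a sketch of what one DECLARE_FUNCTION_PTR line expands to, and of how the loader might bind it with dlsym; the binding code is an assumption, not taken from the patch:

#include <dlfcn.h>
#include <functional>

#include <CL/cl.h>

// DECLARE_FUNCTION_PTR(clBuildProgram) expands to a std::function holding
// the exact signature of the real OpenCL entry point:
std::function<decltype(clBuildProgram)> clBuildProgram_ptr = nullptr;

// Hypothetical binding step inside load(): look the symbol up in the
// dynamically opened OpenCL library and store it if present.
bool bind_clBuildProgram(void *handle)
{
    auto *fn = reinterpret_cast<decltype(clBuildProgram) *>(dlsym(handle, "clBuildProgram"));
    if(fn != nullptr)
    {
        clBuildProgram_ptr = fn;
    }
    return fn != nullptr;
}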
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
index dab133f05..5b6c44cdd 100644
--- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
@@ -56,6 +56,16 @@ public:
* @param[in] act_info Activation layer information.
*/
void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
+ * of the activation function. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
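The new static validate() follows a pattern added across many kernels in this release: the same checks configure() performs, but on ITensorInfo objects, so no tensor memory needs to be allocated. A sketch under that assumption (shapes, types and the helper name are made up):

#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

// Hypothetical pre-flight check: can a RELU activation be configured
// for an F32 tensor of the given shape?
bool can_configure_relu(const TensorShape &shape)
{
    const TensorInfo          input(shape, 1, DataType::F32);
    const TensorInfo          output(shape, 1, DataType::F32);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);
    const Status              status = CLActivationLayerKernel::validate(&input, &output, act);
    return status.error_code() == ErrorCode::OK;
}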
diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
index 0895fe3f7..96b8dc8d4 100644
--- a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
+++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
@@ -59,6 +59,16 @@ public:
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAdditionKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
index d7755d5e3..c5f862a61 100644
--- a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
+++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
@@ -61,6 +61,16 @@ public:
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtractionKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index 26825efba..8643d83bc 100644
--- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -55,14 +55,32 @@ public:
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
* The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] epsilon Small value to avoid division with zero.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
*/
void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division with zero.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *var,
+ const ITensorInfo *beta, const ITensorInfo *gamma,
+ float epsilon);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
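As a reference for the parameters documented above, this is the per-element computation batch normalization performs under its standard definition (a sketch, not code from the library):

#include <cmath>

// Reference (scalar) form of batch normalization for one element x of a
// feature map: normalize with the map's mean/variance, then scale and shift.
float batch_norm_ref(float x, float mean, float var, float beta, float gamma, float epsilon)
{
    return gamma * (x - mean) / std::sqrt(var + epsilon) + beta; // epsilon guards var == 0
}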
diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
index 404b2d144..96ce44220 100644
--- a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
+++ b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
@@ -53,14 +53,14 @@ public:
~CLChannelExtractKernel() = default;
/** Set the input and output of the kernel
*
- * @param[in] input Source tensor.
+ * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
* @param[in] channel Channel to extract.
* @param[out] output Destination tensor. Must be of U8 format.
*/
void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
/** Set the input and output of the kernel
*
- * @param[in] input Multi-planar source image.
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
* @param[in] channel Channel to extract.
* @param[out] output Single-planar 2D destination image. Must be of U8 format.
*/
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
index 807748cfd..bd86da1b5 100644
--- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h
+++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
@@ -66,7 +66,7 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/QASYMM8/F16/F32
* @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
* while the rest represent batch of outputs. Data types supported: Same as @p input
* @param[in] convolved_dims Output convolved dimensions.
diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
index 23f1c56c6..edd05ef00 100644
--- a/arm_compute/core/CL/kernels/CLColorConvertKernel.h
+++ b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
@@ -53,26 +53,27 @@ public:
/** Set the input and output of the kernel
*
- * @param[in] input Source tensor
- * @param[out] output Destination tensor
+ * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+ * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+ * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888)
*/
void configure(const ICLTensor *input, ICLTensor *output);
/** Set the input and output of the kernel
*
- * @param[in] input multi-planar source image
- * @param[out] output single-planar destination image
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
*/
void configure(const ICLMultiImage *input, ICLImage *output);
/** Set the input and output of the kernel
*
- * @param[in] input single-planar source image
- * @param[out] output multi-planar destination image
+ * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+ * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
*/
void configure(const ICLImage *input, ICLMultiImage *output);
/** Set the input and output of the kernel
*
- * @param[in] input multi-planar source image
- * @param[out] output multi-planar destination image
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
*/
void configure(const ICLMultiImage *input, ICLMultiImage *output);
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
index 2833d8ec2..467bdfab3 100644
--- a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
@@ -35,21 +35,21 @@ class ICLTensor;
/** Interface for the depth concatenate kernel.
* The input tensor will be concatenated into the output tensor.
*/
-class CLDepthConcatenateKernel : public ICLKernel
+class CLDepthConcatenateLayerKernel : public ICLKernel
{
public:
/** Default constructor */
- CLDepthConcatenateKernel();
+ CLDepthConcatenateLayerKernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthConcatenateKernel(const CLDepthConcatenateKernel &) = delete;
+ CLDepthConcatenateLayerKernel(const CLDepthConcatenateLayerKernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthConcatenateKernel &operator=(const CLDepthConcatenateKernel &) = delete;
+ CLDepthConcatenateLayerKernel &operator=(const CLDepthConcatenateLayerKernel &) = delete;
/** Allow instances of this class to be moved */
- CLDepthConcatenateKernel(CLDepthConcatenateKernel &&) = default;
+ CLDepthConcatenateLayerKernel(CLDepthConcatenateLayerKernel &&) = default;
/** Allow instances of this class to be moved */
- CLDepthConcatenateKernel &operator=(CLDepthConcatenateKernel &&) = default;
+ CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default;
/** Default destructor */
- ~CLDepthConcatenateKernel() = default;
+ ~CLDepthConcatenateLayerKernel() = default;
/** Initialise the kernel's inputs and output
*
* @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32.
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
index da70bff0f..3a6310d69 100644
--- a/arm_compute/core/CL/kernels/CLDepthConvertKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
@@ -36,7 +36,7 @@ class ICLTensor;
/** Interface for the depth conversion kernel.
*
*/
-class CLDepthConvertKernel : public ICLSimple2DKernel
+class CLDepthConvertLayerKernel : public ICLSimple2DKernel
{
public:
/** Set the input and output of the kernel.
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
index 4e69f551b..eb62465f8 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h
@@ -32,27 +32,29 @@ class ICLTensor;
/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
*/
-class CLDepthwiseConvolution3x3Kernel : public ICLKernel
+class CLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel
{
public:
/** Default constructor */
- CLDepthwiseConvolution3x3Kernel();
+ CLDepthwiseConvolutionLayer3x3Kernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolution3x3Kernel(const CLDepthwiseConvolution3x3Kernel &) = delete;
+ CLDepthwiseConvolutionLayer3x3Kernel(const CLDepthwiseConvolutionLayer3x3Kernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolution3x3Kernel &operator=(const CLDepthwiseConvolution3x3Kernel &) = delete;
+ CLDepthwiseConvolutionLayer3x3Kernel &operator=(const CLDepthwiseConvolutionLayer3x3Kernel &) = delete;
/** Default Move Constructor. */
- CLDepthwiseConvolution3x3Kernel(CLDepthwiseConvolution3x3Kernel &&) = default;
+ CLDepthwiseConvolutionLayer3x3Kernel(CLDepthwiseConvolutionLayer3x3Kernel &&) = default;
/** Default move assignment operator. */
- CLDepthwiseConvolution3x3Kernel &operator=(CLDepthwiseConvolution3x3Kernel &&) = default;
+ CLDepthwiseConvolutionLayer3x3Kernel &operator=(CLDepthwiseConvolutionLayer3x3Kernel &&) = default;
/** Initialize the function's source, destination, conv and border_size.
*
- * @param[in] input Source tensor. DataType supported: F32.
+ * @param[in] input Source tensor. DataType supported: QASYMM8/F32.
+ * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases (Optional) Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
* @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] weights Weights tensor. These are 3D tensors with dimensions [3, 3, IFM]. Data type supported: Same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info);
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -63,10 +65,11 @@ private:
const ICLTensor *_input;
ICLTensor *_output;
const ICLTensor *_weights;
+ const ICLTensor *_biases;
unsigned int _conv_stride_x;
unsigned int _conv_stride_y;
- unsigned int _conv_pad_x;
- unsigned int _conv_pad_y;
+ unsigned int _conv_pad_left;
+ unsigned int _conv_pad_top;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTIONKERNEL3x3_H__ */
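A hypothetical call site for the reordered configure() shown above, with the optional biases argument now sitting between weights and output; tensor setup is assumed to happen elsewhere:

#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h"

using namespace arm_compute;

// Sketch: configure a 3x3 depthwise convolution with stride 1 and 1-pixel
// padding; biases may be nullptr when the layer has no bias.
void setup_depthwise(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output)
{
    CLDepthwiseConvolutionLayer3x3Kernel kernel;
    const PadStrideInfo                  conv_info(1, 1, 1, 1); // stride_x, stride_y, pad_x, pad_y
    kernel.configure(input, weights, biases, output, conv_info);
}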
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h
index ae56adfa3..7e786e8df 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h
@@ -56,8 +56,9 @@ public:
* while every dimension above 3 represents a batch. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias.
*/
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info);
+ void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h
index d493d9f05..7989257d3 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h
@@ -52,14 +52,16 @@ public:
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: F32.
* @param[out] output The output tensor. Data type supported: same as @p input.
+ * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input.
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases = nullptr);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
const ICLTensor *_input;
+ const ICLTensor *_biases;
ICLTensor *_output;
};
} // arm_compute
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
index d876143a3..d47b7da21 100644
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
@@ -56,22 +56,37 @@ public:
* 5x5 convolution with stride_x = 1/2, stride_y = 1/2
*
* @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QS8/QS16/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type
* @param[out] output Output tensor.
* The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
+ /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel
+ *
+ * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM]. Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * @param[in] output Output tensor.
+ * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] target Target GPU architecture.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
private:
const ICLTensor *_input;
@@ -79,8 +94,6 @@ private:
const ICLTensor *_weights;
ICLTensor *_output;
BorderSize _border_size;
- int _conv_pad_x;
- int _conv_pad_y;
int _conv_stride_x;
int _conv_stride_y;
};
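
For illustration, the new static validate() allows a caller to check a configuration before the kernel is configured; the Status it returns is the error-reporting type introduced in arm_compute/core/Error.h later in this patch. A minimal usage sketch, assuming already-initialised CLTensor objects, a PadStrideInfo and a GPUTarget (all names here are illustrative, not from the library):

    // Hypothetical pre-flight check before configuring the kernel.
    arm_compute::Status status = arm_compute::CLDirectConvolutionLayerKernel::validate(
        input.info(), weights.info(), biases.info(), output.info(), conv_info, target);
    if(bool(status))
    {
        conv_kernel.configure(&input, &weights, &biases, &output, conv_info);
    }
    else
    {
        std::cout << status.error_description() << std::endl; // reason the configuration is invalid
    }
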
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index 8e0c1836a..c87fb2cd6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -64,7 +64,7 @@ public:
CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
index 05956aeeb..b60b80618 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -30,15 +30,15 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel to compute low precision matrix multiplication kernel
+/** OpenCL kernel to multiply matrices
*
+ * @note @ref CLGEMMLowpMatrixMultiplyKernel is the low precision matrix product kernel of GEMMLowp
* This kernel performs the following computation:
- * -# Convert a values from uint8 to int32 and add a_offset to each of them.
- * -# Convert b values from uint8 to int32 and add b_offset to each of them.
- * -# Compute the int32 matrix product of the resulting a * b.
- * -# Add output_offset to each entry of the result.
- * -# Multiply each entry of the result and round to the nearest integer
- * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ *
+ * -# Convert the values of matrix A from QASYMM8 (uint8) to int32
+ * -# Convert the values of matrix B from QASYMM8 (uint8) to int32
+ * -# Compute the int32 matrix product of the converted A and B and store the result as int32
+ *
*/
class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
{
@@ -55,19 +55,12 @@ public:
CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel.
- * These two kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data types supported: U8
- * @param[in] input1 Input tensor containing the transposed Matrix B. Data types supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication, Data types supported: same as @p input0
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_offset Offset to be added to each element of the output matrix
- * @param[in] output_mult_int Offset to be added to each element of the output matrix
- * @param[in] shift Number of bits to shift right the result.
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
+ * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
+ * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
*/
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
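
For illustration, a sketch of the reshaped GEMMLowp path that this configure() expects when is_interleaved_transposed is true. The tensor objects (a, b, a_interleaved, b_transposed, mm_result) are assumed to be created and allocated elsewhere:

    CLGEMMInterleave4x4Kernel      interleave_a;
    CLGEMMTranspose1xWKernel       transpose_b;
    CLGEMMLowpMatrixMultiplyKernel mm_kernel;

    interleave_a.configure(&a, &a_interleaved); // QASYMM8 Matrix A -> interleaved layout
    transpose_b.configure(&b, &b_transposed);   // QASYMM8 Matrix B -> transposed 1xW layout
    mm_kernel.configure(&a_interleaved, &b_transposed, &mm_result /* S32 */, true /* is_interleaved_transposed */);
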
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
new file mode 100644
index 000000000..5f2e02568
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
+ *
+ */
+class CLGEMMLowpOffsetContributionKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLGEMMLowpOffsetContributionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] k Number of matrix A columns, which is equal to the number of matrix B rows.
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ */
+ void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_vector_sum_col;
+ const ICLTensor *_vector_sum_row;
+ ICLTensor *_mm_result;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */
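
For illustration, a scalar reference of the in-place update documented above, with j indexing the output columns. The function and parameter layout are ours, not the library's; only the arithmetic follows the documented formula. Requires <cstdint>:

    void offset_contribution_ref(int32_t *mm_result, const int32_t *vector_sum_col, const int32_t *vector_sum_row,
                                 int rows, int cols, int32_t k, int32_t a_offset, int32_t b_offset)
    {
        for(int i = 0; i < rows; ++i)
        {
            for(int j = 0; j < cols; ++j)
            {
                // Matches the documented formula; each sum term can be skipped when its offset is zero.
                mm_result[i * cols + j] += (vector_sum_col[j] * a_offset)
                                           + (vector_sum_row[i] * b_offset)
                                           + (a_offset * b_offset * k);
            }
        }
    }
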
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
new file mode 100644
index 000000000..49e19e3c6
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Divide by a power of two using result_shift, rounding to nearest
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ */
+class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed point value that each element of the input matrix is multiplied by
+ * @param[in] result_shift Number of bits used in the rounded-to-nearest division by a power of two, applied after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ */
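
For illustration, a scalar sketch following the step order listed above. The two helpers mirror gemmlowp's fixed point primitives and are not part of this header; the min != max guard reflects the assumption that bounds are only applied when requested. Requires <algorithm> and <cstdint>:

    int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        // Rounded (a * b * 2) / 2^31, saturating the INT32_MIN * INT32_MIN corner case.
        const bool    overflow = (a == b) && (a == INT32_MIN);
        const int64_t ab_64    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int32_t nudge    = (ab_64 >= 0) ? (1 << 30) : (1 - (1 << 30));
        return overflow ? INT32_MAX : static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
    }

    int32_t rounding_divide_by_pow2(int32_t x, int exponent) // assumes 0 <= exponent < 31
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
    }

    uint8_t quantize_down_fixedpoint_ref(int32_t acc, int32_t bias, int result_fixedpoint_multiplier,
                                         int result_shift, int result_offset_after_shift, int min, int max)
    {
        int32_t v = saturating_rounding_doubling_high_mul(acc, result_fixedpoint_multiplier);
        v += bias; // skipped when the bias tensor is nullptr
        v = rounding_divide_by_pow2(v, result_shift);
        v += result_offset_after_shift;
        if(min != max) // bounds are only applied when requested
        {
            v = std::max(min, std::min(max, v)); // e.g. for a fused rectified linear unit
        }
        v = std::max(0, std::min(255, v)); // QASYMM8 range
        return static_cast<uint8_t>(v);
    }
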
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
new file mode 100644
index 000000000..87b70efdf
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ */
+class CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_offset Offset to be added to each element of the input matrix
+ * @param[in] result_mult_int Value that each element of the input matrix is multiplied by, once result_offset has been added
+ * @param[in] result_shift Number of bits to shift the result right by before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ */
\ No newline at end of file
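
For illustration, the corresponding scalar sketch for this kernel's step order (names are ours; the clamping convention is the same assumption as in the fixed point sketch above). Requires <algorithm> and <cstdint>:

    uint8_t quantize_down_scale_ref(int32_t acc, int32_t bias, int result_offset,
                                    int result_mult_int, int result_shift, int min, int max)
    {
        int32_t v = (acc + result_offset) * result_mult_int;
        v += bias;          // skipped when the bias tensor is nullptr
        v >>= result_shift; // shift the int32 accumulator right by result_shift
        if(min != max)      // bounds are only applied when requested
        {
            v = std::max(min, std::min(max, v)); // e.g. for a fused rectified linear unit
        }
        v = std::max(0, std::min(255, v)); // QASYMM8 range
        return static_cast<uint8_t>(v);
    }
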
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
new file mode 100644
index 000000000..aa0583fe8
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Common interface for all OpenCL reduction kernels */
+class ICLGEMMLowpReductionKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ ICLGEMMLowpReductionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: QASYMM8
+ * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+ */
+ virtual void configure(const ICLTensor *input, ICLTensor *output) = 0;
+
+protected:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] mtx_a Input tensor. Data type supported: QASYMM8
+ * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+ */
+ void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) override;
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] mtx_b Input tensor. Data type supported: QASYMM8
+ * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+ */
+ void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) override;
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ */
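
For illustration, a scalar reference of the Matrix A reduction; the Matrix B column sums are analogous, iterating down each column instead. The function and parameter names are ours. Requires <cstdint>:

    void matrix_a_row_sums_ref(const uint8_t *mtx_a, int rows, int cols, int32_t *vector_sum_row)
    {
        for(int i = 0; i < rows; ++i)
        {
            int32_t sum = 0;
            for(int j = 0; j < cols; ++j)
            {
                sum += mtx_a[i * cols + j];
            }
            vector_sum_row[i] = sum; // S32 sum of row i, later scaled by b_offset in the offset contribution
        }
    }
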
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 50bc64c2c..8a3772046 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -70,7 +70,7 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index eed683b4c..1d8b5500c 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -69,7 +69,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
* @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
* while every dimension above represents a batch. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
diff --git a/arm_compute/core/CL/kernels/CLL2NormalizeKernel.h b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
index 2056b4e61..f7d717119 100644
--- a/arm_compute/core/CL/kernels/CLL2NormalizeKernel.h
+++ b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -32,21 +32,21 @@ namespace arm_compute
class ICLTensor;
/** Interface for the reduction operation kernel */
-class CLL2NormalizeKernel : public ICLKernel
+class CLL2NormalizeLayerKernel : public ICLKernel
{
public:
/** Default constructor */
- CLL2NormalizeKernel();
+ CLL2NormalizeLayerKernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLL2NormalizeKernel(const CLL2NormalizeKernel &) = delete;
+ CLL2NormalizeLayerKernel(const CLL2NormalizeLayerKernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLL2NormalizeKernel &operator=(const CLL2NormalizeKernel &) = delete;
+ CLL2NormalizeLayerKernel &operator=(const CLL2NormalizeLayerKernel &) = delete;
/** Allow instances of this class to be moved */
- CLL2NormalizeKernel(CLL2NormalizeKernel &&) = default;
+ CLL2NormalizeLayerKernel(CLL2NormalizeLayerKernel &&) = default;
/** Allow instances of this class to be moved */
- CLL2NormalizeKernel &operator=(CLL2NormalizeKernel &&) = default;
+ CLL2NormalizeLayerKernel &operator=(CLL2NormalizeLayerKernel &&) = default;
/** Default destructor */
- ~CLL2NormalizeKernel() = default;
+ ~CLL2NormalizeLayerKernel() = default;
/** Set the input and output tensors.
*
diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
index f74f7514e..d931152cb 100644
--- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -45,7 +45,6 @@ public:
CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default;
/** Default move assignment operator. */
CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default;
-
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -54,6 +53,16 @@ public:
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
index 309a202df..6746a49dd 100644
--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
@@ -59,6 +59,20 @@ public:
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
+ *
+ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info. Data types supported: same as @p input1. Note: a U8 (QS8, QS16) output requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
index 9251a8ed9..e9ce28b3f 100644
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Error.h"
+
namespace arm_compute
{
class ICLTensor;
@@ -51,11 +53,20 @@ public:
*
* @note QS8 and QS16 are supported only for pool sizes 3, 5 and 7
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
index d8ccfa88c..044b5e700 100644
--- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
@@ -49,7 +49,7 @@ public:
~CLReshapeLayerKernel() = default;
/** Set the input and output of the kernel
*
- * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input Source tensor. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
* @param[out] output Destination tensor. Data type supported: Same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
index db0587d6a..3bca6efd0 100644
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -42,8 +42,9 @@ public:
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy Interpolation type to use
* @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
*/
- void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
+ void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
// Inherited methods overridden:
BorderSize border_size() const override;
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
index 1e641b48d..c072d2a6d 100644
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -26,6 +26,8 @@
#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+#include <tuple>
+
namespace arm_compute
{
class ICLTensor;
@@ -36,13 +38,21 @@ class CLLogits1DMaxKernel : public ICLSimple3DKernel
public:
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
* @param[out] output Destination tensor. Data types supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] output Destination tensor. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};
-/** Interface for shifting the logits values around the max value and exponentiating the result */
+/** Interface for shifting, exponentiating and summing the logits */
class CLLogits1DShiftExpSumKernel : public ICLKernel
{
public:
@@ -58,12 +68,23 @@ public:
CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
* @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
+ * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
+ * @param[out] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
+ * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.0
*/
- void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum);
+ void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DShiftExpSumKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] max Max values tensor. Data types supported: same as @p input
+ * @param[in] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
+ * @param[in] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -75,6 +96,68 @@ private:
ICLTensor *_sum;
};
+/** Interface for max, shifting, exponentiating and summing the logits */
+class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
+{
+public:
+ using ParallelReductionInfo = std::tuple<bool, unsigned int>;
+
+public:
+ /** Default constructor */
+ CLLogits1DMaxShiftExpSumKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in,out] max Max values tensor. Data types supported: same as @p input
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
+ * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
+ */
+ void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] max Max values tensor. Data types supported: same as @p input
+ * @param[in] output Destination tensor. Data types supported: same as @p input
+ * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
+ /** Checks if the given size is eligible for parallel reduction
+ *
+ * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size).
+ * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
+ *
+ * @param[in] size Size to check
+ *
+ * @return A two-element tuple where the first element is a boolean specifying whether a parallel reduction will be run,
+ * while the second element is the vector size used for the execution.
+ */
+ static ParallelReductionInfo is_parallel_reduction(size_t size);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_max;
+ ICLTensor *_output;
+ ICLTensor *_sum;
+
+private:
+ static const unsigned int _grid_size;
+ static const unsigned int _serial_vector_size;
+ static const unsigned int _parallel_vector_size;
+};
/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
class CLLogits1DNormKernel : public ICLKernel
{
@@ -91,11 +174,21 @@ public:
CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32
* @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
+ * @param[in] beta (Optional) A scaling factor for the exponent. (Default = 1.0)
+ */
+ void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta = 1.0f);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32
+ * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
+ * @param[in] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
+ *
+ * @return a status
*/
- void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
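
For illustration, how a caller might query the parallel-reduction decision described above, assuming an initialised input tensor (names are illustrative):

    using Info = CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo;
    const Info         info        = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(input.info()->dimension(0));
    const bool         is_parallel = std::get<0>(info); // true when width >= grid_size * serial_vector_size
    const unsigned int vector_size = std::get<1>(info); // vector size picked for the execution
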
diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h
index faccf5e37..2e1b481d3 100644
--- a/arm_compute/core/CL/kernels/CLTransposeKernel.h
+++ b/arm_compute/core/CL/kernels/CLTransposeKernel.h
@@ -40,10 +40,18 @@ class CLTransposeKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor. Data type supported: Same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index 07c7c772c..6c84ded49 100644
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -47,9 +47,10 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/F16/F32
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/QASYMM8/F16/F32
* @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+ * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
* @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
*/
void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output);
diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
index 1eabfa943..f55f41b0e 100644
--- a/arm_compute/core/CPP/CPPKernels.h
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -27,6 +27,7 @@
/* Header regrouping all the CPP kernels */
#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
#endif /* __ARM_COMPUTE_CPPKERNELS_H__ */
diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
new file mode 100644
index 000000000..0e7c93877
--- /dev/null
+++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPPERMUTEKERNEL_H__
+#define __ARM_COMPUTE_CPPPERMUTEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** CPP kernel to perform tensor permutation.
+ *
+ * Permutes a tensor according to a given permutation vector
+ */
+class CPPPermuteKernel : public ICPPKernel
+{
+public:
+ /** Default constructor */
+ CPPPermuteKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPPermuteKernel(const CPPPermuteKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPPermuteKernel &operator=(const CPPPermuteKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CPPPermuteKernel(CPPPermuteKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CPPPermuteKernel &operator=(CPPPermuteKernel &&) = default;
+ /** Default destructor */
+ ~CPPPermuteKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel
+ *
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Template function to run the permute
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void run_permute(const Window &window);
+
+ /** Common signature for all the specialised permute functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using PermuteFunctionPtr = void (CPPPermuteKernel::*)(const Window &window);
+
+ PermuteFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
+ PermutationVector _perm;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CPPPERMUTEKERNEL_H__ */
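
For illustration, a minimal sketch of validating and then configuring the new permute kernel, with tensors assumed created elsewhere and a hypothetical permutation vector:

    CPPPermuteKernel        permute;
    const PermutationVector perm(2U, 0U, 1U); // a hypothetical 3D permutation
    if(bool(CPPPermuteKernel::validate(input.info(), output.info(), perm)))
    {
        permute.configure(&input, &output, perm);
    }
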
diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h
index 96dd3711c..ae8d6c350 100644
--- a/arm_compute/core/Dimensions.h
+++ b/arm_compute/core/Dimensions.h
@@ -100,7 +100,20 @@ public:
*
* @return The size of the requested dimension.
*/
- T operator[](size_t dimension) const
+ const T &operator[](size_t dimension) const
+ {
+ ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions);
+ return _id[dimension];
+ }
+ /** Generic accessor to get the size of any dimension
+ *
+ * @note Precondition: dimension < Dimensions::num_max_dimensions
+ *
+ * @param[in] dimension Dimension of the wanted size
+ *
+ * @return The size of the requested dimension.
+ */
+ T &operator[](size_t dimension)
{
ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions);
return _id[dimension];
@@ -119,8 +132,8 @@ public:
/** Collapse dimensions.
*
- * @param[in] first Dimensions into which the following @p n are collapsed.
* @param[in] n Number of dimensions to collapse into @p first.
+ * @param[in] first Dimensions into which the following @p n are collapsed.
*/
void collapse(size_t n, size_t first = 0)
{
@@ -141,6 +154,17 @@ public:
std::fill(_id.begin() + _num_dimensions, _id.end(), 0);
}
+ /** Collapse dimensions starting from a given point
+ *
+ * @param[in] start Starting point of collapsing dimensions
+ */
+ void collapse_from(size_t start)
+ {
+ ARM_COMPUTE_ERROR_ON(start > num_dimensions());
+
+ collapse(num_dimensions() - start, start);
+ }
+
/** Returns a read/write iterator that points to the first element in the dimension array. */
typename std::array<T, num_max_dimensions>::iterator begin()
{
@@ -179,5 +203,16 @@ protected:
std::array<T, num_max_dimensions> _id;
size_t _num_dimensions{ 0 };
};
+
+template <typename T>
+inline bool operator==(const Dimensions<T> &lhs, const Dimensions<T> &rhs)
+{
+ return ((lhs.num_dimensions() == rhs.num_dimensions()) && std::equal(lhs.cbegin(), lhs.cend(), rhs.cbegin()));
+}
+template <typename T>
+inline bool operator!=(const Dimensions<T> &lhs, const Dimensions<T> &rhs)
+{
+ return !(lhs == rhs);
+}
}
#endif /*__ARM_COMPUTE_DIMENSIONS_H__*/
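
For illustration, the behaviour of the new collapse_from() helper and equality operators, using TensorShape (which derives from Dimensions<size_t>); the values are hypothetical:

    TensorShape shape(3U, 5U, 7U, 2U); // [3, 5, 7, 2]
    shape.collapse_from(1);            // collapse dimensions 1..3 into dimension 1 -> [3, 70]
    const TensorShape expected(3U, 70U);
    const bool        same = (shape == expected); // true, via the new operator==
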
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index c4c452bac..97dbba3fa 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -24,55 +24,236 @@
#ifndef __ARM_COMPUTE_ERROR_H__
#define __ARM_COMPUTE_ERROR_H__
-/** Print the given message then throw an std::runtime_error.
+#include <stdarg.h>
+#include <string>
+
+namespace arm_compute
+{
+enum class ErrorCode
+{
+ OK, /**< No error */
+ RUNTIME_ERROR /**< Generic runtime error */
+};
+
+/** Status class */
+class Status
+{
+public:
+ /** Default Constructor **/
+ Status()
+ : _code(ErrorCode::OK), _error_description(" ")
+ {
+ }
+ /** Constructor
+ *
+ * @param error_status Error status.
+ * @param error_description (Optional) Error description if error_status is not valid.
+ */
+ explicit Status(ErrorCode error_status, std::string error_description = " ")
+ : _code(error_status), _error_description(error_description)
+ {
+ }
+ /** Allow instances of this class to be copy constructed */
+ Status(const Status &) = default;
+ /** Allow instances of this class to be move constructed */
+ Status(Status &&) = default;
+ /** Allow instances of this class to be copy assigned */
+ Status &operator=(const Status &) = default;
+ /** Allow instances of this class to be move assigned */
+ Status &operator=(Status &&) = default;
+ /** Explicit bool conversion operator
+ *
+ * @return True if there is no error else false
+ */
+ explicit operator bool() const noexcept
+ {
+ return _code == ErrorCode::OK;
+ }
+ /** Gets error code
+ *
+ * @return Error code.
+ */
+ ErrorCode error_code() const
+ {
+ return _code;
+ }
+ /** Gets error description if any
+ *
+ * @return Error description.
+ */
+ std::string error_description() const
+ {
+ return _error_description;
+ }
+ /** Throws a runtime exception in case it contains an error */
+ void throw_if_error()
+ {
+ if(!bool(*this))
+ {
+ internal_throw_on_error();
+ }
+ }
+
+private:
+ /** Internal throwing function */
+ [[noreturn]] void internal_throw_on_error();
+
+private:
+ ErrorCode _code;
+ std::string _error_description;
+};
+
+/** Creates an error containing the error message from variable argument list
*
- * @param[in] ... Message to display before aborting.
+ * @param[in] error_code Error code
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] msg Message to display before aborting.
+ * @param[in] args Variable argument list of the message.
+ *
+ * @return status containing the error
*/
-#define ARM_COMPUTE_ERROR(...) ::arm_compute::error(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT
-
-/** Print the given message then throw an std::runtime_error.
+Status create_error_va_list(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, va_list args);
+/** Creates an error containing the error message
*
- * @param[in] func Function in which the error occurred.
- * @param[in] file File in which the error occurred.
- * @param[in] line Line in which the error occurred.
- * @param[in] ... Message to display before aborting.
+ * @param[in] error_code Error code
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] msg Error description message.
+ * @param[in] ... Variable number of arguments of the message.
+ *
+ * @return status containing the error
*/
-#define ARM_COMPUTE_ERROR_LOC(func, file, line, ...) ::arm_compute::error(func, file, line, __VA_ARGS__) // NOLINT
-
+Status create_error(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, ...);
+/** Print an error message then throw an std::runtime_error
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] msg Message to display before aborting.
+ * @param[in] ... Variable number of arguments of the message.
+ */
+[[noreturn]] void error(const char *function, const char *file, const int line, const char *msg, ...);
+}
/** To avoid unused variables warnings
*
* This is useful if for example a variable is only used
* in debug builds and generates a warning in release builds.
*
- * @param[in] var Variable which is unused
+ * @param[in] var Variable which is unused.
*/
#define ARM_COMPUTE_UNUSED(var) (void)(var)
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
-/** Print the given message
+/** Creates an error with a given message
+ *
+ * @param[in] error_code Error code.
+ * @param[in] ... Message to encapsulate.
+ */
+#define ARM_COMPUTE_CREATE_ERROR(error_code, ...) ::arm_compute::create_error(error_code, __func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT
+
+/** Creates an error at the given location with a given message
*
- * @param[in] ... Message to display
+ * @param[in] error_code Error code.
+ * @param[in] func Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ * @param[in] ... Error description message.
*/
-#define ARM_COMPUTE_INFO(...) ::arm_compute::debug(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT
-/** If the condition is true, the given message is printed
+#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, ...) ::arm_compute::create_error(error_code, func, file, line, __VA_ARGS__) // NOLINT
+
+/** Checks if a status contains an error and returns it
+ *
+ * @param[in] status Status value to check
+ */
+#define ARM_COMPUTE_RETURN_ON_ERROR(status) \
+ do \
+ { \
+ if(!bool(status)) \
+ { \
+ return status; \
+ } \
+ } while(false)
+
+/** Checks if an error value is valid and, if not, throws an exception containing the error
+ *
+ * @param[in] error Error value to check.
+ */
+#define ARM_COMPUTE_THROW_ON_ERROR(error) \
+ error.throw_if_error();
+
+/** If the condition is true, an error is returned
*
* @param[in] cond Condition to evaluate.
- * @param[in] ... Message to print if cond is false.
+ * @param[in] ... Error description message
*/
-#define ARM_COMPUTE_INFO_ON_MSG(cond, ...) \
- do \
- { \
- if(cond) \
- { \
- ARM_COMPUTE_INFO(__VA_ARGS__); \
- } \
- } while(0)
-#else /* ARM_COMPUTE_DEBUG_ENABLED */
-#define ARM_COMPUTE_INFO_ON_MSG(cond, ...)
-#define ARM_COMPUTE_INFO(...)
-#endif /* ARM_COMPUTE_DEBUG_ENABLED */
+#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, ...) \
+ do \
+ { \
+ if(cond) \
+ { \
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, __VA_ARGS__); \
+ } \
+ } while(false)
+
+/** If the condition is true, an error is returned
+ *
+ * @param[in] cond Condition to evaluate.
+ * @param[in] func Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ * @param[in] ... Error description message.
+ */
+#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, ...) \
+ do \
+ { \
+ if(cond) \
+ { \
+ return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, __VA_ARGS__); \
+ } \
+ } while(false)
+
+/** If the condition is true, an error is returned
+ *
+ * @param[in] cond Condition to evaluate
+ */
+#define ARM_COMPUTE_RETURN_ERROR_ON(cond) \
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond)
+
+/** If the condition is true, an error is returned
+ *
+ * @param[in] cond Condition to evaluate.
+ * @param[in] func Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ */
+#define ARM_COMPUTE_RETURN_ERROR_ON_LOC(cond, func, file, line) \
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, #cond)
+
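+/* Illustrative usage of the Status pattern (a sketch; validate() is a hypothetical
+ * function written to show how the macros above compose):
+ *
+ *   arm_compute::Status validate(int width)
+ *   {
+ *       ARM_COMPUTE_RETURN_ERROR_ON_MSG(width <= 0, "Width must be positive");
+ *       return arm_compute::Status{};
+ *   }
+ *
+ *   arm_compute::Status s = validate(0);
+ *   ARM_COMPUTE_THROW_ON_ERROR(s); // throws std::runtime_error carrying the description
+ */
+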
+/** Print the given message then throw an std::runtime_error.
+ *
+ * @param[in] ... Message to display before aborting.
+ */
+#define ARM_COMPUTE_ERROR(...) ::arm_compute::error(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT
+
+/** Print the given message then throw an std::runtime_error.
+ *
+ * @param[in] func Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ * @param[in] ... Message to display before aborting.
+ */
+#define ARM_COMPUTE_ERROR_LOC(func, file, line, ...) ::arm_compute::error(func, file, line, __VA_ARGS__) // NOLINT
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+/** Checks if a status value is valid and, if not, throws an exception containing the error
+ *
+ * @param[in] status Status value to check.
+ */
+#define ARM_COMPUTE_ERROR_THROW_ON(status) \
+ status.throw_if_error()
+
/** If the condition is true, the given message is printed and an exception is thrown
*
* @param[in] cond Condition to evaluate.
@@ -112,6 +293,7 @@
*/
#define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) (cond) ? throw std::logic_error(msg) : val;
#else /* ARM_COMPUTE_ASSERTS_ENABLED */
+#define ARM_COMPUTE_ERROR_THROW_ON(status)
#define ARM_COMPUTE_ERROR_ON_MSG(cond, ...)
#define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...)
#define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) val
@@ -119,14 +301,14 @@
/** If the condition is true then an error message is printed and an exception thrown
*
- * @param[in] cond Condition to evaluate
+ * @param[in] cond Condition to evaluate.
*/
#define ARM_COMPUTE_ERROR_ON(cond) \
ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
/** If the condition is true then an error message is printed and an exception thrown
*
- * @param[in] cond Condition to evaluate
+ * @param[in] cond Condition to evaluate.
* @param[in] func Function in which the error occurred.
* @param[in] file File in which the error occurred.
* @param[in] line Line in which the error occurred.
@@ -134,27 +316,4 @@
#define ARM_COMPUTE_ERROR_ON_LOC(cond, func, file, line) \
ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, #cond)
-namespace arm_compute
-{
-/** Print an error message then throw an std::runtime_error
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] msg Message to display before aborting.
- * @param[in] ... Variable number of arguments of the message.
- */
-[[noreturn]] void error(const char *function, const char *file, const int line, const char *msg, ...);
-
-/** Print a debug message
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] msg Message to display before aborting.
- * @param[in] ... Variable number of arguments of the message.
- */
-void debug(const char *function, const char *file, const int line, const char *msg, ...);
-}
-
#endif /* __ARM_COMPUTE_ERROR_H__ */
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
index 82c2d3347..6e00500b1 100644
--- a/arm_compute/core/FixedPoint.h
+++ b/arm_compute/core/FixedPoint.h
@@ -225,96 +225,96 @@ qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position);
qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position);
/** 16 bit fixed point scalar saturating multiply
-*
-* @param[in] a First 16 bit fixed point input
-* @param[in] b Second 16 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow
-*/
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow
+ */
qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position);
/** 8 bit fixed point scalar inverse square root
-*
-* @param[in] a 8 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 8 bit fixed point inverse square root.
-*/
+ *
+ * @param[in] a 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point inverse square root.
+ */
qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position);
/** 16 bit fixed point scalar inverse square root
-*
-* @param[in] a 16 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 16 bit fixed point inverse square root.
-*/
+ *
+ * @param[in] a 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point inverse square root.
+ */
qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position);
/** 8 bit fixed point scalar division
-*
-* @param[in] a First 8 bit fixed point input
-* @param[in] b Second 8 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 8 bit fixed point division.
-*/
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point division.
+ */
qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position);
/** 16 bit fixed point scalar division
-*
-* @param[in] a First 16 bit fixed point input
-* @param[in] b Second 16 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 16 bit fixed point division.
-*/
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point division.
+ */
qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position);
/** 8 bit fixed point scalar exponential
-*
-* @param[in] a 8 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 8 bit fixed point exponential.
-*/
+ *
+ * @param[in] a 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point exponential.
+ */
qint8_t sqexp_qs8(qint8_t a, int fixed_point_position);
/** 16 bit fixed point scalar exponential
-*
-* @param[in] a 16 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 16 bit fixed point exponential.
-*/
+ *
+ * @param[in] a 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point exponential.
+ */
qint16_t sqexp_qs16(qint16_t a, int fixed_point_position);
/** 16 bit fixed point scalar exponential
-*
-* @param[in] a 16 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 16 bit fixed point exponential.
-*/
+ *
+ * @param[in] a 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point exponential.
+ */
qint16_t sexp_qs16(qint16_t a, int fixed_point_position);
/** 8 bit fixed point scalar logarithm
-*
-* @param[in] a 8 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 8 bit fixed point logarithm.
-*/
+ *
+ * @param[in] a 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point logarithm.
+ */
qint8_t slog_qs8(qint8_t a, int fixed_point_position);
/** 16 bit fixed point scalar logarithm
-*
-* @param[in] a 16 bit fixed point input
-* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
-*
-* @return The result of the 16 bit fixed point logarithm.
-*/
+ *
+ * @param[in] a 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point logarithm.
+ */
qint16_t slog_qs16(qint16_t a, int fixed_point_position);
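+
+/* Worked example of the encoding shared by these functions (a sketch): with
+ * fixed_point_position = 3, three bits hold the fractional part, so a qint8_t
+ * raw value of 8 represents 8 / 2^3 = 1.0 and a raw value of 12 represents 1.5.
+ */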
/** Convert an 8 bit fixed point to float
diff --git a/arm_compute/core/GLES_COMPUTE/GCHelpers.h b/arm_compute/core/GLES_COMPUTE/GCHelpers.h
new file mode 100644
index 000000000..475554f2b
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/GCHelpers.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCHELPERS_H__
+#define __ARM_COMPUTE_GCHELPERS_H__
+
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "support/ToolchainSupport.h"
+
+#include <string>
+
+namespace arm_compute
+{
+/** Helper function to create and return a unique_ptr pointing to a GLES kernel object.
+ * It also calls the kernel's configure() method.
+ *
+ * @param[in] args All the arguments that need to be passed to the kernel's configure() method.
+ *
+ * @return A unique pointer pointing to a GLES kernel object
+ */
+template <typename Kernel, typename... T>
+std::unique_ptr<Kernel> create_configure_kernel(T &&... args)
+{
+ std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>();
+ k->configure(std::forward<T>(args)...);
+ return k;
+}
+
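+/* Illustrative usage (a sketch; GCTransposeKernel is one of the GLES kernels in this
+ * library, and input/output stand for previously created IGCTensor pointers):
+ *
+ *   auto k = create_configure_kernel<GCTransposeKernel>(input, output);
+ *   // k is a std::unique_ptr<GCTransposeKernel> whose configure() has already run
+ */
+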
+/** Helper function to create and return a unique_ptr pointing to a GLES kernel object
+ *
+ * @return A unique pointer pointing to a GLES kernel object
+ */
+template <typename Kernel>
+std::unique_ptr<Kernel> create_kernel()
+{
+ std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>();
+ return k;
+}
+
+/** Max vector width of a GLES vector */
+static constexpr unsigned int max_gc_vector_width = 16;
+}
+#endif /* __ARM_COMPUTE_GCHELPERS_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
new file mode 100644
index 000000000..082732904
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCKERNELLIBRARY_H__
+#define __ARM_COMPUTE_GCKERNELLIBRARY_H__
+
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Utils.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace arm_compute
+{
+/** GCProgram class */
+class GCProgram
+{
+public:
+ /** Default constructor. */
+ GCProgram();
+ /** Construct program from source file.
+ *
+ * @param[in] name Program name.
+ * @param[in] source Program source.
+ */
+ GCProgram(std::string name, std::string source);
+ /** Default Copy Constructor. */
+ GCProgram(const GCProgram &) = default;
+ /** Default Move Constructor. */
+ GCProgram(GCProgram &&) = default;
+ /** Default copy assignment operator. */
+ GCProgram &operator=(const GCProgram &) = default;
+ /** Default move assignment operator. */
+ GCProgram &operator=(GCProgram &&) = default;
+ /** Returns program name.
+ *
+ * @return Program's name.
+ */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Link program.
+ *
+ * @param[in] shader Shader used to link program.
+ *
+ * @return Linked program id.
+ */
+ GLuint link_program(GLuint shader);
+ /** Compile shader.
+ *
+ * @param[in] build_options Shader build options.
+ *
+ * @return GLES shader object.
+ */
+ GLuint compile_shader(const std::string &build_options);
+
+private:
+ std::string _name; /**< Program name. */
+ std::string _source; /**< Source code for the program. */
+};
+
+/** GCKernel class */
+class GCKernel
+{
+public:
+ /** Default Constructor. */
+ GCKernel();
+ /** Default Copy Constructor. */
+ GCKernel(const GCKernel &) = default;
+ /** Default Move Constructor. */
+ GCKernel(GCKernel &&) = default;
+ /** Default copy assignment operator. */
+ GCKernel &operator=(const GCKernel &) = default;
+ /** Default move assignment operator. */
+ GCKernel &operator=(GCKernel &&) = default;
+ /** Constructor.
+ *
+ * @param[in] name Kernel name.
+ * @param[in] program Built program.
+ */
+ GCKernel(std::string name, GLuint program);
+ /** Destructor.
+ */
+ ~GCKernel();
+ /** Returns kernel name.
+ *
+ * @return Kernel's name.
+ */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Get program id.
+ *
+ * @return program id.
+ */
+ GLuint get_program() const
+ {
+ return _program;
+ }
+ /** Use current program. */
+ void use();
+ /** Unuse current program. */
+ void unuse();
+ /** Set argument value at index of shader params.
+ *
+ * @param[in] idx Index in shader params.
+ * @param[in] value Argument value to be set.
+ */
+ template <class T>
+ void set_argument(unsigned int idx, T value)
+ {
+ if(idx >= _shader_arguments.size())
+ {
+ _shader_arguments.resize(idx + 1, 0);
+ }
+
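+ // Store the argument's raw bits in a 32-bit slot
+ // (assumes sizeof(T) <= sizeof(unsigned int))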
+ unsigned int *p = reinterpret_cast<unsigned int *>(&value);
+ _shader_arguments[idx] = *p;
+ }
+ /** Clear shader arguments.
+ *
+ */
+ void clear_arguments()
+ {
+ _shader_arguments.clear();
+ }
+ /** Set shader params binding point.
+ *
+ * @param[in] binding Shader params binding point.
+ */
+ void set_shader_params_binding_point(unsigned int binding)
+ {
+ _shader_params_binding_point = binding;
+ }
+ /** Update shader params.
+ *
+ */
+ void update_shader_params();
+ /** Clean up program and ubo.
+ *
+ */
+ void cleanup();
+
+private:
+ std::string _name; /**< Kernel name */
+ GLuint _program; /**< Linked program id */
+ std::vector<unsigned int> _shader_arguments; /**< Store all the values of the shader arguments */
+ GLuint _shader_params_ubo_name; /**< Uniform buffer object name for shader parameters */
+ GLuint _shader_params_binding_point; /**< The binding point of the uniform block for shader parameters */
+ GLuint _shader_params_index; /**< The index of the uniform block */
+ GLint _shader_params_size; /**< The uniform block data size in the shader */
+ static constexpr const char *_shader_params_name = "shader_params"; /**< The uniform block name in the shader */
+};
+
+/** GCKernelLibrary class */
+class GCKernelLibrary
+{
+ using StringSet = std::set<std::string>;
+
+private:
+ /** Default Constructor. */
+ GCKernelLibrary();
+
+public:
+ /** Prevent instances of this class from being copied. */
+ GCKernelLibrary(const GCKernelLibrary &) = delete;
+ /** Prevent instances of this class from being copied. */
+ const GCKernelLibrary &operator=(const GCKernelLibrary &) = delete;
+ /** Default Destructor. */
+ ~GCKernelLibrary();
+
+ static GCKernelLibrary &get();
+ /** Initialises the kernel library.
+ *
+ * @param[in] shader_path (Optional) Path of the directory from which shader sources are loaded.
+ * @param[in] dpy (Optional) EGLDisplay set by external application.
+ * @param[in] ctx (Optional) EGLContext set by external application.
+ */
+ void init(std::string shader_path = "./", EGLDisplay dpy = EGL_NO_DISPLAY, EGLContext ctx = EGL_NO_CONTEXT)
+ {
+ _shader_path = std::move(shader_path);
+
+ _display = dpy;
+ _context = ctx;
+
+ if(_display == EGL_NO_DISPLAY || _context == EGL_NO_CONTEXT)
+ {
+ setup_context();
+
+ _own_context = true;
+ }
+
+ eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context);
+ setup_dummy_fbo();
+ }
+
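+ /* Illustrative usage (a sketch): with the default arguments the library creates
+ * its own EGL context and loads shader sources from "./":
+ *
+ * GCKernelLibrary::get().init();
+ */
+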
+ /** Sets the path that the shaders reside in.
+ *
+ * @param[in] shader_path Path of the folder that contains the shaders.
+ */
+ void set_shader_path(const std::string &shader_path)
+ {
+ _shader_path = shader_path;
+ };
+ /** Sets display and context to create kernel.
+ *
+ * @param[in] dpy EGLDisplay set by external application.
+ * @param[in] ctx EGLContext set by external application.
+ */
+ void set_context(EGLDisplay dpy, EGLContext ctx)
+ {
+ _display = dpy;
+ _context = ctx;
+
+ eglMakeCurrent(dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, ctx);
+ setup_dummy_fbo();
+ };
+ /** Creates a kernel from the kernel library.
+ *
+ * @param[in] shader_name Shader name.
+ * @param[in] build_options_set Shader build options as a set.
+ *
+ * @return The created kernel.
+ */
+ GCKernel create_kernel(const std::string &shader_name, const StringSet &build_options_set = {}) const;
+ /** Serializes and saves programs to a binary.
+ *
+ */
+ void save_binary();
+ /** Load serialized binary with all the programs.
+ *
+ */
+ void load_binary();
+ /** Set up a dummy FBO to work around an issue on Galaxy S8.
+ *
+ */
+ void setup_dummy_fbo();
+
+private:
+ /** Preprocess GLES shader
+ *
+ * @param[in] shader_source Source code of the shader to preprocess.
+ *
+ * @return Preprocessed GLES shader object.
+ */
+ const std::string preprocess_shader(const std::string &shader_source) const;
+ /** Load program and its dependencies.
+ *
+ * @param[in] program_name Name of the program to load.
+ */
+ const GCProgram &load_program(const std::string &program_name) const;
+ /** Concatenates contents of a set into a single string.
+ *
+ * @param[in] s Input set to concatenate.
+ *
+ * @return Concatenated string.
+ */
+ std::string stringify_set(const StringSet &s) const;
+ /** Set up EGL context.
+ */
+ void setup_context();
+
+ EGLDisplay _display; /**< Underlying EGL Display. */
+ EGLContext _context; /**< Underlying EGL Context. */
+ GLuint _frame_buffer; /**< Dummy fbo */
+ GLuint _tex_rt; /**< Dummy texture for render target */
+ bool _own_context; /**< Self created context or not. */
+ std::string _shader_path; /**< Path to the shaders folder. */
+ mutable std::map<std::string, const GCProgram> _programs_map; /**< Map with all already loaded program data. */
+ mutable std::map<std::string, const GCKernel> _built_programs_map; /**< Map with all already built program data. */
+ static const std::map<std::string, std::string> _shader_program_map; /**< Map that associates kernel names with programs. */
+ static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
+ Used for compile-time shader inclusion. */
+};
+}
+#endif /* __ARM_COMPUTE_GCKERNELLIBRARY_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernels.h b/arm_compute/core/GLES_COMPUTE/GCKernels.h
new file mode 100644
index 000000000..417c98af6
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/GCKernels.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCKERNELS_H__
+#define __ARM_COMPUTE_GCKERNELS_H__
+
+/* Header regrouping all the GLES compute kernels */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
+
+#endif /* __ARM_COMPUTE_GCKERNELS_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
new file mode 100644
index 000000000..11b2b17e5
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCKERNEL_H__
+#define __ARM_COMPUTE_IGCKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+#include "arm_compute/core/IKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+class Window;
+
+/** Common interface for all the GLES kernels */
+class IGCKernel : public IKernel
+{
+public:
+ /** Constructor */
+ IGCKernel();
+ /** Returns a reference to the GLES kernel of this object.
+ *
+ * @return A reference to the GLES kernel of this object.
+ */
+ GCKernel &kernel();
+
+ class BufferParam
+ {
+ public:
+ /** Tensor's binding point in this kernel. */
+ unsigned int binding_point = 0;
+ /** The base 2 logarithm of the SSBO buffer data type size (number of bits to shift for offset calculation) */
+ unsigned int buffer_data_type_shift = 0;
+
+ /** Constructor
+ *
+ * @param[in] binding Tensor's binding point.
+ * @param[in] shift Number of bits to shift for offset calculation.
+ */
+ BufferParam(const unsigned int binding, const unsigned int shift)
+ : binding_point(binding), buffer_data_type_shift(shift)
+ {
+ }
+ };
+
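+ /* Example (a sketch): for an SSBO holding 32-bit values the shift is log2(4) = 2,
+ * so binding point 1 would be described as BufferParam(1, 2). */
+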
+ /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple tensors the indices start from 1, and a single index must be set to 0.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] binding_point Tensor's binding point in this kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
+
+ /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple tensors the indices start from 1, and a single index must be set to 0.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] param Additional parameter for GLES SSBO buffer.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+ /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple tensors the indices start from 1, and a single index must be set to 0.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] binding_point Tensor's binding point in this kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
+
+ /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple tensors the indices start from 1, and a single index must be set to 0.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] param Additional parameter for GLES SSBO buffer.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+ /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple tensors the indices start from 1, and a single index must be set to 0.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] binding_point Tensor's binding point in this kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
+
+ /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple tensors the indices start from 1, and a single index must be set to 0.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] param Additional parameter for GLES SSBO buffer.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+ /** Returns the number of arguments enqueued per 1D tensor object.
+ *
+ * @return The number of arguments enqueued per 1D tensor object.
+ */
+ unsigned int num_arguments_per_1D_tensor() const;
+ /** Returns the number of arguments enqueued per 2D tensor object.
+ *
+ * @return The number of arguments enqueued per 2D tensor object.
+ */
+ unsigned int num_arguments_per_2D_tensor() const;
+ /** Returns the number of arguments enqueued per 3D tensor object.
+ *
+ * @return The number of arguments enqueued per 3D tensor object.
+ */
+ unsigned int num_arguments_per_3D_tensor() const;
+ /** Enqueue the OpenGL ES shader to process the given window
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ virtual void run(const Window &window) = 0;
+
+private:
+ /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple tensors the indices start from 1, and a single index must be set to 0.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] param Additional parameter for GLES SSBO buffer.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ template <unsigned int dimension_size>
+ void add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+ /** Returns the number of arguments enqueued per tensor object.
+ *
+ * @return The number of arguments enqueued per tensor object.
+ */
+ template <unsigned int dimension_size>
+ unsigned int num_arguments_per_tensor() const;
+
+protected:
+ GCKernel _kernel; /**< GLES kernel to run */
+};
+
+/** Add the kernel to the command queue with the given window.
+ *
+ * @note Depending on the size of the window, this might translate into several jobs being enqueued.
+ *
+ * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
+ *
+ * @param[in] kernel Kernel to enqueue
+ * @param[in] window Window the kernel has to process.
+ * @param[in] lws Local workgroup size requested, by default (1, 1, 1)
+ *
+ * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
+ */
+void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws = gles::NDRange(1U, 1U, 1U));
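+
+/* Illustrative usage (a sketch; MyKernel is a hypothetical IGCKernel subclass):
+ *
+ *   void MyKernel::run(const Window &window)
+ *   {
+ *       // ... set shader arguments ...
+ *       enqueue(*this, window, gles::NDRange(4U, 4U, 1U));
+ *   }
+ */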
+}
+#endif /*__ARM_COMPUTE_IGCKERNEL_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
new file mode 100644
index 000000000..413e86a2b
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCSIMPLE2DKERNEL_H__
+#define __ARM_COMPUTE_IGCSIMPLE2DKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */
+class IGCSimple2DKernel : public IGCSimpleKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+}
+#endif /*__ARM_COMPUTE_IGCSIMPLE2DKERNEL_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
new file mode 100644
index 000000000..622e53c38
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCSIMPLE3DKERNEL_H__
+#define __ARM_COMPUTE_IGCSIMPLE3DKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for simple GLES kernels having 1 tensor input and 1 tensor output.
+ * Both input tensor and output tensor must have at least 3 dimensions.
+ */
+class IGCSimple3DKernel : public IGCSimple2DKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+}
+#endif /*__ARM_COMPUTE_IGCSIMPLE3DKERNEL_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
new file mode 100644
index 000000000..a23c4e774
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCSIMPLEKERNEL_H__
+#define __ARM_COMPUTE_IGCSIMPLEKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output */
+class IGCSimpleKernel : public IGCKernel
+{
+public:
+ /** Constructor. */
+ IGCSimpleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ IGCSimpleKernel(const IGCSimpleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ IGCSimpleKernel &operator=(const IGCSimpleKernel &) = delete;
+ /** Allow instances of this class to be moved. */
+ IGCSimpleKernel(IGCSimpleKernel &&) = default;
+ /** Allow instances of this class to be moved. */
+ IGCSimpleKernel &operator=(IGCSimpleKernel &&) = default;
+ /** Default destructor */
+ ~IGCSimpleKernel() = default;
+
+ /** Configure the kernel
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
+ * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] border_size (Optional) Size of the border.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+
+protected:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+};
+}
+
+#endif /*__ARM_COMPUTE_IGCSIMPLEKERNEL_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCTensor.h b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
new file mode 100644
index 000000000..ab4e57e0c
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCTENSOR_H__
+#define __ARM_COMPUTE_IGCTENSOR_H__
+
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/ITensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Interface for GLES Compute tensor */
+class IGCTensor : public ITensor
+{
+public:
+ /** Default constructor. */
+ IGCTensor();
+
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ IGCTensor(const IGCTensor &) = delete;
+
+ /** Prevent instances of this class from being copy assigned (As this class contains pointers). */
+ IGCTensor &operator=(const IGCTensor &) = delete;
+
+ /** Allow instances of this class to be moved */
+ IGCTensor(IGCTensor &&) = default;
+
+ /** Allow instances of this class to be moved */
+ IGCTensor &operator=(IGCTensor &&) = default;
+
+ /** Virtual destructor */
+ virtual ~IGCTensor() = default;
+
+ /** Map on an allocated buffer.
+ *
+ * @param[in] blocking (Optional) If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ void map(bool blocking = true);
+ /** Unmap an allocated and mapped buffer.
+ */
+ void unmap();
+ /** Clear the contents of the tensor synchronously.
+ */
+ void clear();
+
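+ /* Illustrative usage (a sketch): map to obtain a CPU-visible pointer, access the
+ * data through buffer(), then unmap before the GPU uses the tensor again:
+ *
+ * tensor.map();
+ * uint8_t *data = tensor.buffer();
+ * // ... read or write through data ...
+ * tensor.unmap();
+ */
+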
+ // Inherited methods overridden:
+ uint8_t *buffer() const override;
+ /** Interface to be implemented by the child class to return the tensor's GLES compute buffer id.
+ *
+ * @return An SSBO buffer id.
+ */
+ virtual GLuint gc_buffer() const = 0;
+
+protected:
+ /** Method to be implemented by the child class to map the SSBO.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ virtual uint8_t *do_map(bool blocking) = 0;
+ /** Method to be implemented by the child class to unmap the SSBO.
+ *
+ * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ */
+ virtual void do_unmap() = 0;
+
+private:
+ uint8_t *_mapping;
+};
+
+using IGCImage = IGCTensor;
+}
+#endif /*__ARM_COMPUTE_IGCTENSOR_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/OpenGLES.h b/arm_compute/core/GLES_COMPUTE/OpenGLES.h
new file mode 100644
index 000000000..e12398294
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/OpenGLES.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OPENGLES_H__
+#define __ARM_COMPUTE_OPENGLES_H__
+
+#include "arm_compute/core/Log.h"
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <EGL/eglplatform.h>
+#include <GLES3/gl31.h>
+#include <GLES3/gl3ext.h>
+#include <cstddef>
+
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+#define ARM_COMPUTE_GL_CHECK(x) \
+ x; \
+ { \
+ GLenum error = glGetError(); \
+ if(error != GL_NO_ERROR) \
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("glGetError() = %i (0x%.8x)\n", error, error); \
+ }
+#else /* ARM_COMPUTE_DEBUG_ENABLED */
+#define ARM_COMPUTE_GL_CHECK(x) x
+#endif /* ARM_COMPUTE_DEBUG_ENABLED */
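+
+/* Illustrative usage (a sketch; buffer_name stands for a previously created GL buffer
+ * name): wrap a GL call so that, in debug builds, glGetError() is polled right after it:
+ *
+ *   ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer_name));
+ */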
+
+namespace arm_compute
+{
+namespace gles
+{
+/** Class interface for specifying NDRange values. */
+class NDRange
+{
+private:
+ size_t _sizes[3];
+ size_t _dimensions;
+
+public:
+ /** Default constructor - resulting range has zero dimensions. */
+ NDRange()
+ : _dimensions(0)
+ {
+ _sizes[0] = 0;
+ _sizes[1] = 0;
+ _sizes[2] = 0;
+ }
+
+ /** Constructs one-dimensional range.
+ *
+ * @param[in] size0 Size of the first dimension.
+ */
+ NDRange(size_t size0)
+ : _dimensions(1)
+ {
+ _sizes[0] = size0;
+ _sizes[1] = 1;
+ _sizes[2] = 1;
+ }
+
+ /** Constructs two-dimensional range.
+ *
+ * @param[in] size0 Size of the first dimension.
+ * @param[in] size1 Size of the second dimension.
+ */
+ NDRange(size_t size0, size_t size1)
+ : _dimensions(2)
+ {
+ _sizes[0] = size0;
+ _sizes[1] = size1;
+ _sizes[2] = 1;
+ }
+
+ /** Constructs three-dimensional range.
+ *
+ * @param[in] size0 Size of the first dimension.
+ * @param[in] size1 Size of the second dimension.
+ * @param[in] size2 Size of the third dimension.
+ */
+ NDRange(size_t size0, size_t size1, size_t size2)
+ : _dimensions(3)
+ {
+ _sizes[0] = size0;
+ _sizes[1] = size1;
+ _sizes[2] = size2;
+ }
+
+ /** Conversion operator to const size_t *.
+ *
+ * @returns A pointer to the size of the first dimension.
+ */
+ operator const size_t *() const
+ {
+ return _sizes;
+ }
+
+ /** Queries the number of dimensions in the range.
+ *
+ * @returns The number of dimensions.
+ */
+ size_t dimensions() const
+ {
+ return _dimensions;
+ }
+
+ /** Returns the size of the object in bytes based on the runtime number of dimensions
+ *
+ * @returns The size of the object in bytes.
+ */
+ size_t size() const
+ {
+ return _dimensions * sizeof(size_t);
+ }
+
+ /** Returns the sizes array for each dimension.
+ *
+ * @returns The sizes array
+ */
+ size_t *get()
+ {
+ return _sizes;
+ }
+
+ /** Returns the sizes array for each dimension.
+ *
+ * @returns The sizes array
+ */
+ const size_t *get() const
+ {
+ return _sizes;
+ }
+};
+
+static const NDRange NullRange;
+static const NDRange Range_128_1 = NDRange(128, 1);
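+
+/* Illustrative usage (a sketch): a 4x4 local workgroup size for a 2D shader.
+ *
+ *   gles::NDRange lws(4, 4);
+ *   // lws.dimensions() == 2; lws.get()[0] == 4
+ */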
+} // namespace gles
+
+/** Check if the OpenGL ES 3.1 API is available at runtime.
+ *
+ * @returns true if the OpenGL ES 3.1 API is available.
+ */
+bool opengles31_is_available();
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_OPENGLES_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
new file mode 100644
index 000000000..71f7b3770
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H__
+#define __ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the absolute difference kernel.
+ *
+ * Absolute difference is computed by:
+ * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
+ */
+class GCAbsoluteDifferenceKernel : public IGCKernel
+{
+public:
+ /** Default constructor. */
+ GCAbsoluteDifferenceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ GCAbsoluteDifferenceKernel(const GCAbsoluteDifferenceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ GCAbsoluteDifferenceKernel &operator=(const GCAbsoluteDifferenceKernel &) = delete;
+ /** Allow instances of this class to be moved. */
+ GCAbsoluteDifferenceKernel(GCAbsoluteDifferenceKernel &&) = default;
+ /** Allow instances of this class to be moved. */
+ GCAbsoluteDifferenceKernel &operator=(GCAbsoluteDifferenceKernel &&) = default;
+ /** Default destructor */
+ ~GCAbsoluteDifferenceKernel() = default;
+
+ /** Set the inputs and output images.
+ *
+ * @param[in] input1 Source tensor. Data types supported: U8
+ * @param[in] input2 Source tensor. Data types supported: U8
+ * @param[out] output Destination tensor. Data types supported: U8
+ */
+ void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input1; /**< Source tensor 1. */
+ const IGCTensor *_input2; /**< Source tensor 2. */
+ IGCTensor *_output; /**< Destination tensor. */
+};
+}
+#endif /* __ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H__ */
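
As a plain-C++ reference for the formula documented above, a sketch independent of the GC runtime (function name is illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>

    // Scalar reference of output(x,y) = |input1(x,y) - input2(x,y)| for U8 data.
    void absolute_difference_ref(const uint8_t *in1, const uint8_t *in2,
                                 uint8_t *out, size_t count)
    {
        for(size_t i = 0; i < count; ++i)
        {
            out[i] = static_cast<uint8_t>(std::abs(static_cast<int>(in1[i]) - static_cast<int>(in2[i])));
        }
    }
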
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
new file mode 100644
index 000000000..fc1d52f45
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the activation layer kernel. */
+class GCActivationLayerKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCActivationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCActivationLayerKernel(const GCActivationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCActivationLayerKernel &operator=(const GCActivationLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCActivationLayerKernel(GCActivationLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCActivationLayerKernel &operator=(GCActivationLayerKernel &&) = default;
+ /** Default destructor */
+ ~GCActivationLayerKernel() = default;
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is nullptr, the activation function will be performed in-place
+ *
+ * @param[in, out] input Source tensor. If @p output is nullptr, this tensor will store the result
+ * of the activation function. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Data type should match the input data type.
+ * @param[in] act_info Activation layer information.
+ */
+ void configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ IGCTensor *_input;
+ IGCTensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H__ */
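
A hedged usage sketch of the in-place behaviour noted above; `input` is assumed to be an already-allocated GLES tensor, and the kernel would still need to be scheduled as elsewhere in the library:

    // In-place ReLU: per the @note in configure(), passing nullptr as the
    // output makes the kernel write its result back into `input`.
    void configure_inplace_relu(arm_compute::IGCTensor *input,
                                arm_compute::GCActivationLayerKernel &kernel)
    {
        kernel.configure(input, nullptr,
                         arm_compute::ActivationLayerInfo(
                             arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
    }
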
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
new file mode 100644
index 000000000..2bbd6a83f
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the BatchNormalization layer kernel.
+ */
+class GCBatchNormalizationLayerKernel : public IGCKernel
+{
+public:
+ /** Constructor */
+ GCBatchNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCBatchNormalizationLayerKernel(const GCBatchNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCBatchNormalizationLayerKernel &operator=(const GCBatchNormalizationLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ GCBatchNormalizationLayerKernel(GCBatchNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ GCBatchNormalizationLayerKernel &operator=(GCBatchNormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~GCBatchNormalizationLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division by zero.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+ const IGCTensor *_mean;
+ const IGCTensor *_var;
+ const IGCTensor *_beta;
+ const IGCTensor *_gamma;
+ float _epsilon;
+};
+}
+#endif /*__ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H__ */
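
The kernel applies inference-time batch normalisation per feature map; a scalar sketch of the per-element computation (the textbook formula, not quoted from this header):

    #include <cmath>

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta, where mean, var,
    // beta and gamma are the values for the feature map the element lives in.
    float batch_norm_ref(float x, float mean, float var,
                         float beta, float gamma, float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }
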
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
new file mode 100644
index 000000000..257ab0eca
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCCOL2IMKERNEL_H__
+#define __ARM_COMPUTE_GCCOL2IMKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the col2im reshaping kernel.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref GCIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class GCCol2ImKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCCol2ImKernel();
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCCol2ImKernel(const GCCol2ImKernel &) = delete;
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCCol2ImKernel &operator=(const GCCol2ImKernel &) = delete;
+
+ /** Allow instances of this class to be moved */
+ GCCol2ImKernel(GCCol2ImKernel &&) = default;
+
+ /** Allow instances of this class to be moved */
+ GCCol2ImKernel &operator=(GCCol2ImKernel &&) = default;
+
+ /** Default destructor */
+ ~GCCol2ImKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. Data types supported: F32
+ * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+ * while the rest represent a batch of outputs. Data types supported: Same as @p input
+ * @param[in] convolved_dims Output convolved dimensions.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+};
+}
+
+#endif /*__ARM_COMPUTE_GCCOL2IMKERNEL_H__ */
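
For the single-column example above, col2im reduces to a row-major reshape; a sketch (no batching, one column):

    #include <cstddef>

    // Element i of the column lands at image row i / w, column i % w.
    void col2im_ref(const float *column, float *image, size_t w, size_t h)
    {
        for(size_t i = 0; i < w * h; ++i)
        {
            image[i] = column[i]; // row-major: a0..a8 fill a 3x3 block when w == h == 3
        }
    }
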
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
new file mode 100644
index 000000000..ce220cc56
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the depth concatenate kernel.
+ * The input tensor will be concatenated into the output tensor.
+ */
+class GCDepthConcatenateLayerKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCDepthConcatenateLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCDepthConcatenateLayerKernel(const GCDepthConcatenateLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCDepthConcatenateLayerKernel &operator=(const GCDepthConcatenateLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCDepthConcatenateLayerKernel(GCDepthConcatenateLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCDepthConcatenateLayerKernel &operator=(GCDepthConcatenateLayerKernel &&) = default;
+ /** Default destructor */
+ ~GCDepthConcatenateLayerKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Input tensor. Data types supported: F16/F32.
+ * @param[in] depth_offset The offset on the Z axis.
+ * @param[in,out] output Output tensor. Data types supported: Same as @p input.
+ *
+ * @note The two lowest dimensions of the output tensor can't be smaller than those of the input.
+ * @note The differences between the two lowest dimensions of input and output need to be divisible by 2.
+ *
+ */
+ void configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+ int _top_bottom;
+ int _left_right;
+};
+}
+#endif /* __ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H__ */
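
A sketch of the copy implied by configure() above, under the simplifying assumption that input and output share the same 2D plane size (i.e. no border to fill):

    #include <cstddef>

    // Copy an input of `ifm` planes of `plane` elements each into the output,
    // starting at plane `depth_offset` on the Z axis.
    void depth_concat_ref(const float *in, float *out,
                          size_t plane, size_t ifm, size_t depth_offset)
    {
        for(size_t z = 0; z < ifm; ++z)
        {
            for(size_t i = 0; i < plane; ++i)
            {
                out[(depth_offset + z) * plane + i] = in[z * plane + i];
            }
        }
    }
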
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
new file mode 100644
index 000000000..415b781bc
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the direct convolution kernel.
+ */
+template <unsigned int kernel_size>
+class GCDirectConvolutionLayerKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCDirectConvolutionLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCDirectConvolutionLayerKernel(const GCDirectConvolutionLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCDirectConvolutionLayerKernel &operator=(const GCDirectConvolutionLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCDirectConvolutionLayerKernel(GCDirectConvolutionLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCDirectConvolutionLayerKernel &operator=(GCDirectConvolutionLayerKernel &&) = default;
+ /** Default destructor */
+ ~GCDirectConvolutionLayerKernel() = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+ * @param[in] bias Biases tensor. Shared bias supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
+ * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+ * while every dimension above represents a batch. Data types supported: Same as @p input
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ const IGCTensor *_bias;
+ const IGCTensor *_weights;
+ IGCTensor *_output;
+ BorderSize _border_size;
+ int _conv_stride_x;
+ int _conv_stride_y;
+ int _conv_pad_x;
+ int _conv_pad_y;
+ gles::NDRange _lws;
+};
+
+using GCDirectConvolutionLayer1x1Kernel = GCDirectConvolutionLayerKernel<1>;
+using GCDirectConvolutionLayer3x3Kernel = GCDirectConvolutionLayerKernel<3>;
+using GCDirectConvolutionLayer5x5Kernel = GCDirectConvolutionLayerKernel<5>;
+}
+#endif /*__ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H__ */
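
These kernels rely on the usual convolution output-size arithmetic; a sketch using the template's kernel_size and the pad/stride values carried in PadStrideInfo (standard formula, floor division):

    // out_dim = (in_dim + 2 * pad - kernel_size) / stride + 1
    unsigned int conv_out_dim(unsigned int in_dim, unsigned int kernel_size,
                              unsigned int pad, unsigned int stride)
    {
        return (in_dim + 2 * pad - kernel_size) / stride + 1;
    }
    // e.g. a 3x3 kernel over a 224-wide input with pad 1, stride 1 -> 224.
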
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
new file mode 100644
index 000000000..9f04411d9
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H__
+#define __ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the dropout layer kernel.
+ *
+ * Dropout is used to reduce over-fitting in neural networks.
+ *
+ */
+class GCDropoutLayerKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCDropoutLayerKernel();
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCDropoutLayerKernel(const GCDropoutLayerKernel &) = delete;
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCDropoutLayerKernel &operator=(const GCDropoutLayerKernel &) = delete;
+
+ /** Allow instances of this class to be moved */
+ GCDropoutLayerKernel(GCDropoutLayerKernel &&) = default;
+
+ /** Allow instances of this class to be moved */
+ GCDropoutLayerKernel &operator=(GCDropoutLayerKernel &&) = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor for this op. Data types supported: F16/F32
+ * @param[out] mask The mask tensor. Data types supported: Same as @p input
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] ratio Dropout ratio
+ * @param[in] forward Forward or backward propagation
+ *
+ */
+ void configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_mask;
+ IGCTensor *_output;
+ unsigned int _num_elems_processed_per_iteration;
+};
+}
+
+#endif /*__ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H__ */
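
A scalar sketch of the forward pass implied by configure(); the mask generation and scaling below follow the common inverted-dropout convention, which is an assumption here, and the shader's random source will differ:

    #include <cstddef>
    #include <cstdlib>

    // Zero an element with probability `ratio`, otherwise scale it by
    // 1/(1 - ratio); the mask records the decision for the backward pass.
    void dropout_forward_ref(const float *in, float *mask, float *out,
                             size_t count, float ratio)
    {
        for(size_t i = 0; i < count; ++i)
        {
            const bool keep = (static_cast<float>(std::rand()) / RAND_MAX) >= ratio;
            mask[i] = keep ? 1.0f / (1.0f - ratio) : 0.0f;
            out[i]  = in[i] * mask[i];
        }
    }
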
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
new file mode 100644
index 000000000..acb8aa67d
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCFILLBORDERKERNEL_H__
+#define __ARM_COMPUTE_GCFILLBORDERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for filling the border of a kernel */
+class GCFillBorderKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCFillBorderKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCFillBorderKernel(const GCFillBorderKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCFillBorderKernel &operator=(const GCFillBorderKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCFillBorderKernel(GCFillBorderKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCFillBorderKernel &operator=(GCFillBorderKernel &&) = default;
+ /** Default destructor */
+ ~GCFillBorderKernel() = default;
+
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in,out] tensor Tensor to process. Data types supported: F16/F32.
+ * @param[in] border_size Size of the border to fill in elements.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+ /** Function to set the constant value on fill border kernel depending on type.
+ *
+ * @param[in] idx Index of the kernel argument to set.
+ * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ template <class T>
+ void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ const IGCTensor *_tensor;
+};
+}
+#endif /*__ARM_COMPUTE_GCFILLBORDERKERNEL_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
new file mode 100644
index 000000000..b2369a6ad
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H__
+#define __ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** OpenGL ES kernel which interleaves the elements of a matrix A in chunks of 4x4
+ *
+ * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ */
+class GCGEMMInterleave4x4Kernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCGEMMInterleave4x4Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMInterleave4x4Kernel(const GCGEMMInterleave4x4Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMInterleave4x4Kernel &operator=(const GCGEMMInterleave4x4Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCGEMMInterleave4x4Kernel(GCGEMMInterleave4x4Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCGEMMInterleave4x4Kernel &operator=(GCGEMMInterleave4x4Kernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+
+ // Inherited methods overridden
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H__ */
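
A sketch of the 4x4 interleaving shown above, for a matrix whose sides are multiples of 4 (edge handling omitted; output written as one flat run per block):

    #include <cstddef>

    // Read the input in 4x4 blocks; within each block, walk column by column
    // so a block emits a00 a10 a20 a30 a01 a11 a21 a31 ... on one output row.
    void interleave4x4_ref(const float *in, float *out, size_t width, size_t height)
    {
        size_t o = 0;
        for(size_t by = 0; by < height; by += 4)
        {
            for(size_t bx = 0; bx < width; bx += 4)
            {
                for(size_t x = 0; x < 4; ++x)
                {
                    for(size_t y = 0; y < 4; ++y)
                    {
                        out[o++] = in[(by + y) * width + bx + x];
                    }
                }
            }
        }
    }
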
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 000000000..77a52b2aa
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+#define __ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+/** Interface to add a bias to each row of the input tensor
+ *
+ */
+class GCGEMMMatrixAccumulateBiasesKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMMatrixAccumulateBiasesKernel(const GCGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMMatrixAccumulateBiasesKernel &operator=(const GCGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCGEMMMatrixAccumulateBiasesKernel(GCGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCGEMMMatrixAccumulateBiasesKernel &operator=(GCGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data types supported: Same as @p accum
+ */
+ void configure(IGCTensor *accum, const IGCTensor *biases);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ IGCTensor *_accum;
+ const IGCTensor *_biases;
+ gles::NDRange _lws;
+};
+}
+
+#endif /*__ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */
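
A scalar sketch of the accumulation this interface describes (row-major layout assumed):

    #include <cstddef>

    // Add the shared 1D bias vector onto every row of the accumulate tensor.
    void accumulate_biases_ref(float *accum, const float *biases,
                               size_t rows, size_t cols)
    {
        for(size_t r = 0; r < rows; ++r)
        {
            for(size_t c = 0; c < cols; ++c)
            {
                accum[r * cols + c] += biases[c];
            }
        }
    }
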
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
new file mode 100644
index 000000000..02abb8da7
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H__
+#define __ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** OpenGL ES kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ * The matrices must have the same dimensions.
+ *
+ * @note This kernel is computed if and only if beta != 0.0.
+ */
+class GCGEMMMatrixAdditionKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCGEMMMatrixAdditionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMMatrixAdditionKernel(const GCGEMMMatrixAdditionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMMatrixAdditionKernel &operator=(const GCGEMMMatrixAdditionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCGEMMMatrixAdditionKernel(GCGEMMMatrixAdditionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCGEMMMatrixAdditionKernel &operator=(GCGEMMMatrixAdditionKernel &&) = default;
+ /** Initialise the kernel's input, output and beta value
+ *
+ * @note The input and output tensors must have the same dimensions
+ *
+ * @param[in] input Input tensor (Matrix C). Data types supported: F32
+ * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref GCGEMMMatrixMultiplyKernel. Data type supported: same as @p input
+ * @param[in] beta Weight of matrix C
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, float beta);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+};
+}
+
+#endif /* __ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H__ */
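
A sketch of the in-place, beta-weighted addition described in configure() above:

    #include <cstddef>

    // `out` already holds the matrix-multiply result (alpha * AB); `in` is
    // matrix C. After the loop, out = alpha * AB + beta * C.
    void gemm_matrix_add_ref(const float *in, float *out, size_t count, float beta)
    {
        for(size_t i = 0; i < count; ++i)
        {
            out[i] += beta * in[i];
        }
    }
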
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
new file mode 100644
index 000000000..3a0b22f14
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** GLES Compute kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref GCGEMMInterleave4x4Kernel and @ref GCGEMMTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
+ *
+ * @attention The second input tensor must have at least 2 dimensions (matrix)
+ *
+ */
+class GCGEMMMatrixMultiplyKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCGEMMMatrixMultiplyKernel();
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMMatrixMultiplyKernel(const GCGEMMMatrixMultiplyKernel &) = delete;
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMMMatrixMultiplyKernel &operator=(const GCGEMMMatrixMultiplyKernel &) = delete;
+
+ /** Allow instances of this class to be moved */
+ GCGEMMMatrixMultiplyKernel(GCGEMMMatrixMultiplyKernel &&) = default;
+
+ /** Allow instances of this class to be moved */
+ GCGEMMMatrixMultiplyKernel &operator=(GCGEMMMatrixMultiplyKernel &&) = default;
+
+ /** Initialise the kernel's input, output and alpha
+ *
+ * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+ * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+ * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref GCGEMMInterleave4x4Kernel and @ref GCGEMMTranspose1xWKernel
+ */
+ void configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed = true);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input0;
+ const IGCTensor *_input1;
+ IGCTensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H__ */
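
A naive reference of the product this kernel computes, for the non-reshaped case and assuming row-major storage (the actual shader works on the interleaved/transposed layouts):

    #include <cstddef>

    // out = alpha * (A * B), with A of shape m x k and B of shape k x n.
    void gemm_ref(const float *a, const float *b, float *out,
                  size_t m, size_t n, size_t k, float alpha)
    {
        for(size_t i = 0; i < m; ++i)
        {
            for(size_t j = 0; j < n; ++j)
            {
                float acc = 0.0f;
                for(size_t p = 0; p < k; ++p)
                {
                    acc += a[i * k + p] * b[p * n + j];
                }
                out[i * n + j] = alpha * acc;
            }
        }
    }
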
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
new file mode 100644
index 000000000..4223556ac
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H__
+#define __ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** OpenGL ES kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
+ *
+ * The following example shows how the 1xW transposition works when the input data type is F32:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ *
+ */
+class GCGEMMTranspose1xWKernel : public IGCSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+}
+#endif /* __ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H__ */
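
A sketch of the example above for F32 data, where W = 16 / sizeof(float) = 4, assuming the width is a multiple of 4 and writing the output as one flat run:

    #include <cstddef>

    // Each 1x4 chunk of a row is written out contiguously, walking a column
    // of chunks top to bottom: a00..a03 a10..a13 a20..a23 a30..a33.
    void transpose1x4_ref(const float *in, float *out, size_t width, size_t height)
    {
        size_t o = 0;
        for(size_t x = 0; x < width; x += 4)   // one output row per chunk column
        {
            for(size_t y = 0; y < height; ++y) // stack that chunk from every row
            {
                for(size_t i = 0; i < 4; ++i)
                {
                    out[o++] = in[y * width + x + i];
                }
            }
        }
    }
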
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
new file mode 100644
index 000000000..e1b35607f
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCIM2COLKERNEL_H__
+#define __ARM_COMPUTE_GCIM2COLKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
+ * It is used to transform a convolution to a plain matrix multiplication.
+ *
+ * For example, taking the image below and assuming 3x3 image blocks with a stride of 1, we have:
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * =
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class GCIm2ColKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCIm2ColKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCIm2ColKernel(const GCIm2ColKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCIm2ColKernel &operator=(const GCIm2ColKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCIm2ColKernel(GCIm2ColKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCIm2ColKernel &operator=(GCIm2ColKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32
+ * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+ * while every dimension above represents a batch. Data types supported: Same as @p input
+ * @param[in] kernel_dims The kernel dimensions (width and height).
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias In case biases are provided, expands the matrix with 1.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and the kernel's 3 lowest dimensions match the input's)
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ void run_reduced(const Window &window);
+ /** Run the generic convolution layer input reshape kernel
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ void run_generic(const Window &window);
+
+ /** Common signature for the kernel to run */
+ using Im2ColFunction = void (GCIm2ColKernel::*)(const Window &);
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+ unsigned int _num_elems_processed_per_iteration;
+ Im2ColFunction _run_func;
+};
+}
+
+#endif /*__ARM_COMPUTE_GCIM2COLKERNEL_H__ */
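
A naive reference of the 4x4 example above: 3x3 blocks, stride 1, no padding, no bias term appended:

    #include <cstddef>

    // Each output row holds one flattened 3x3 block; blocks are visited
    // left-to-right, top-to-bottom, matching the matrix in the example.
    void im2col_ref(const float *in, float *out, size_t width, size_t height)
    {
        size_t row = 0;
        for(size_t y = 0; y + 3 <= height; ++y)
        {
            for(size_t x = 0; x + 3 <= width; ++x)
            {
                for(size_t ky = 0; ky < 3; ++ky)
                {
                    for(size_t kx = 0; kx < 3; ++kx)
                    {
                        out[row * 9 + ky * 3 + kx] = in[(y + ky) * width + x + kx];
                    }
                }
                ++row;
            }
        }
    }
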
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
new file mode 100644
index 000000000..e8bc7ad2b
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class GCNormalizationLayerKernel : public IGCKernel
+{
+public:
+ /** Constructor */
+ GCNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCNormalizationLayerKernel(const GCNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCNormalizationLayerKernel &operator=(const GCNormalizationLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ GCNormalizationLayerKernel(GCNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ GCNormalizationLayerKernel &operator=(GCNormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~GCNormalizationLayerKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: F32.
+ * @param[in] squared_input Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+ * Data types should match the input type.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types should match the input type.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+ void configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const IGCTensor *_input;
+ const IGCTensor *_squared_input;
+ IGCTensor *_output;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H__ */
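
A per-element sketch of cross-map local response normalisation, which is what the squared_input parameter above serves; this is the textbook form, and the exact scaling of alpha and the parameter names in NormalizationLayerInfo are assumptions here:

    #include <cmath>

    // sq_sum is the sum of squared_input over the norm_size neighbouring
    // feature maps at the same (x, y) position.
    float lrn_ref(float x, float sq_sum, float kappa, float alpha,
                  float beta, unsigned int norm_size)
    {
        return x / std::pow(kappa + (alpha / norm_size) * sq_sum, beta);
    }
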
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
new file mode 100644
index 000000000..3b01b4ad4
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H__
+#define __ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the pixelwise multiplication kernel.
+ *
+ */
+class GCPixelWiseMultiplicationKernel : public IGCKernel
+{
+public:
+ /** Default constructor.*/
+ GCPixelWiseMultiplicationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ GCPixelWiseMultiplicationKernel(const GCPixelWiseMultiplicationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ GCPixelWiseMultiplicationKernel &operator=(const GCPixelWiseMultiplicationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCPixelWiseMultiplicationKernel(GCPixelWiseMultiplicationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCPixelWiseMultiplicationKernel &operator=(GCPixelWiseMultiplicationKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input1 An input tensor. Data types supported: F32.
+ * @param[in] input2 An input tensor. Data types supported: same as @p input1.
+ * @param[out] output The output tensor. Data types supported: same as @p input1.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ */
+ void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input1;
+ const IGCTensor *_input2;
+ IGCTensor *_output;
+};
+}
+
+#endif /*__ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H__ */
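
A scalar sketch of the operation configure() describes, with scale restricted as documented (1/255 or 1/2^n for 0 <= n <= 15):

    #include <cstddef>

    // Element-wise out = in1 * in2 * scale.
    void pixelwise_mul_ref(const float *in1, const float *in2, float *out,
                           size_t count, float scale)
    {
        for(size_t i = 0; i < count; ++i)
        {
            out[i] = in1[i] * in2[i] * scale;
        }
    }
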
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
new file mode 100644
index 000000000..d4921c209
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H__
+#define __ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for the pooling layer kernel */
+class GCPoolingLayerKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCPoolingLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCPoolingLayerKernel(const GCPoolingLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCPoolingLayerKernel &operator=(const GCPoolingLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCPoolingLayerKernel(GCPoolingLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCPoolingLayerKernel &operator=(GCPoolingLayerKernel &&) = default;
+ /** Default destructor */
+ ~GCPoolingLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const IGCTensor *_input;
+ IGCTensor *_output;
+ PoolingLayerInfo _pool_info;
+ BorderSize _border_size;
+ unsigned int _num_elems_processed_per_iteration;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H__ */
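A configuration sketch for 2x2 max pooling with stride 2 (same illustrative GLES setup as above; src and dst are hypothetical, pre-initialised GCTensors):

    GCPoolingLayerKernel pool;
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
    // border_size() reports the input border the kernel reads; the caller is
    // expected to have filled it (e.g. with a border-handling kernel).
    GCScheduler::get().dispatch(pool);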
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
new file mode 100644
index 000000000..483e19b21
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H__
+#define __ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for identifying the max value of 1D Logits */
+class GCLogits1DMaxKernel : public IGCSimple3DKernel
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+};
+
+/** Interface for shifting the logits values around the max value and exponentiating the result */
+class GCLogits1DShiftExpSumKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCLogits1DShiftExpSumKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCLogits1DShiftExpSumKernel(const GCLogits1DShiftExpSumKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCLogits1DShiftExpSumKernel &operator=(const GCLogits1DShiftExpSumKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCLogits1DShiftExpSumKernel(GCLogits1DShiftExpSumKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCLogits1DShiftExpSumKernel &operator=(GCLogits1DShiftExpSumKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32
+ * @param[in] max Max values tensor. Data types supported: same as @p input
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
+ */
+ void configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ const IGCTensor *_max;
+ IGCTensor *_output;
+ IGCTensor *_sum;
+};
+
+/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
+class GCLogits1DNormKernel : public IGCKernel
+{
+public:
+ /** Default constructor */
+ GCLogits1DNormKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCLogits1DNormKernel(const GCLogits1DNormKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCLogits1DNormKernel &operator=(const GCLogits1DNormKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ GCLogits1DNormKernel(GCLogits1DNormKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GCLogits1DNormKernel &operator=(GCLogits1DNormKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32
+ * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ */
+ void configure(const IGCTensor *input, const IGCTensor *sum, IGCTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IGCTensor *_input;
+ const IGCTensor *_sum;
+ IGCTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H__ */
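The three kernels above implement softmax as a pipeline; a sketch of chaining them (illustrative: logits is [N, batch], max1d and sum1d are per-row [1, batch] buffers, tmp and out match logits, all hypothetical pre-initialised GCTensors):

    GCLogits1DMaxKernel         max_kernel;
    GCLogits1DShiftExpSumKernel shift_exp_sum_kernel;
    GCLogits1DNormKernel        norm_kernel;

    max_kernel.configure(&logits, &max1d);
    shift_exp_sum_kernel.configure(&logits, &max1d, &tmp, &sum1d);
    norm_kernel.configure(&tmp, &sum1d, &out);

    // Dispatch strictly in this order: max -> shift/exp/sum -> norm.
    GCScheduler::get().dispatch(max_kernel);
    GCScheduler::get().dispatch(shift_exp_sum_kernel);
    GCScheduler::get().dispatch(norm_kernel);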
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
new file mode 100644
index 000000000..c628a0058
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCTRANSPOSEKERNEL_H__
+#define __ARM_COMPUTE_GCTRANSPOSEKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** OpenGL ES kernel which transposes the elements of a matrix.
+ *
+ * [width, height, batch] -> [height, width, batch]
+ *
+ */
+class GCTransposeKernel : public IGCSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GCTRANSPOSEKERNEL_H__ */
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 6e4d98718..c02f14aec 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -33,6 +33,7 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/utility.h"
#include <array>
#include <cstddef>
@@ -116,6 +117,57 @@ inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy
return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
}
+/** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between
+ * the real Y coordinate and its floor (the largest integer not exceeding it). Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the top pixel value of a single channel input.
+ * @param[in] stride Stride to access the bottom pixel value
+ * @param[in] dy Pixel's distance between the Y real coordinate and its floor, i.e. dy = y - floor(y)
+ *
+ * @note dy must be in the range [0, 1.0]
+ *
+ * @return The linear interpolated pixel value
+ */
+template <typename T>
+inline T delta_linear_c1_y(const T *pixel_ptr, size_t stride, float dy)
+{
+ ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+ const float dy1 = 1.0f - dy;
+
+ const T a00 = *pixel_ptr;
+ const T a10 = *(pixel_ptr + stride);
+
+ const float w1 = dy1;
+ const float w3 = dy;
+
+ return static_cast<T>(a00 * w1 + a10 * w3);
+}
+/** Computes linear interpolation using the pointer to the left pixel and the pixel's distance between
+ * the real X coordinate and its floor (the largest integer not exceeding it). Input must be in single channel format.
+ *
+ * @param[in] pixel_ptr Pointer to the left pixel value of a single channel input.
+ * @param[in] dx Pixel's distance between the X real coordinate and its floor, i.e. dx = x - floor(x)
+ *
+ * @note dx must be in the range [0, 1.0]
+ *
+ * @return The linear interpolated pixel value
+ */
+template <typename T>
+inline T delta_linear_c1_x(const T *pixel_ptr, float dx)
+{
+ ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+ const T a00 = *pixel_ptr;
+ const T a01 = *(pixel_ptr + 1);
+
+ const float dx1 = 1.0f - dx;
+
+ const float w1 = dx1;
+ const float w2 = dx;
+
+ return static_cast<T>(a00 * w1 + a01 * w2);
+}
/** Return the pixel at (x,y) using bilinear interpolation.
*
* @warning Only works if the iterator was created with an IImage
@@ -168,6 +220,18 @@ inline uint8_t pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride,
const float dx = x - xi;
const float dy = y - yi;
+ if(dx == 0.0f)
+ {
+ if(dy == 0.0f)
+ {
+ return static_cast<T>(first_pixel_ptr[static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride]);
+ }
+ return delta_linear_c1_y(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dy);
+ }
+ if(dy == 0.0f)
+ {
+ return delta_linear_c1_x(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, dx);
+ }
return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
}
@@ -459,6 +523,23 @@ inline Strides compute_strides(const ITensorInfo &info)
return compute_strides(info, info.element_size());
}
+/** Permutes given Dimensions according to a permutation vector
+ *
+ * @warning Validity of permutation is not checked
+ *
+ * @param[in, out] dimensions Dimensions to permute
+ * @param[in] perm Permutation vector
+ */
+template <typename T>
+inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm)
+{
+ auto copy_dimensions = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
+ for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ {
+ dimensions[i] = copy_dimensions[perm[i]];
+ }
+}
+
/** Auto initialize the tensor info (shape, number of channels, data type and fixed point position) if the current assignment is empty.
*
* @param[in,out] info Tensor info used to check and assign.
@@ -466,10 +547,24 @@ inline Strides compute_strides(const ITensorInfo &info)
* @param[in] num_channels New number of channels.
* @param[in] data_type New data type
* @param[in] fixed_point_position New fixed point position
+ * @param[in] quantization_info (Optional) New quantization info
+ *
+ * @return True if the tensor info has been initialized
+ */
+bool auto_init_if_empty(ITensorInfo &info,
+ const TensorShape &shape,
+ int num_channels, DataType data_type,
+ int fixed_point_position,
+ QuantizationInfo quantization_info = QuantizationInfo());
+
+/** Auto initialize the tensor info using another tensor info.
+ *
+ * @param[in,out] info_sink   Tensor info used to check and assign
+ * @param[in]     info_source Tensor info used to assign
*
* @return True if the tensor info has been initialized
*/
-bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position);
+bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source);
/** Set the shape to the specified value if the current assignment is empty.
*
@@ -509,6 +604,17 @@ bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type);
* @return True if the fixed point position has been changed.
*/
bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position);
+
+/** Set the quantization info to the specified value if
+ * the current quantization info is empty and the data type is an asymmetric quantized type
+ *
+ * @param[in,out] info Tensor info used to check and assign.
+ * @param[in] quantization_info Quantization info
+ *
+ * @return True if the quantization info has been changed.
+ */
+bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info);
+
/** Helper function to calculate the Valid Region for Scale.
*
* @param[in] src_info Input tensor info used to check.
@@ -520,6 +626,7 @@ bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_positio
* @return The corresponding valid region
*/
ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, InterpolationPolicy policy, BorderSize border_size, bool border_undefined);
+
/** Convert a linear index into n-dimensional coordinates.
*
* @param[in] shape Shape of the n-dimensional tensor.
@@ -528,6 +635,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
* @return n-dimensional coordinates.
*/
inline Coordinates index2coords(const TensorShape &shape, int index);
+
/** Convert n-dimensional coordinates into a linear index.
*
* @param[in] shape Shape of the n-dimensional tensor.
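The permute() helper added above writes dimension perm[i] of the input into dimension i of the output; a small sketch (values illustrative):

    #include "arm_compute/core/Helpers.h"

    using namespace arm_compute;

    TensorShape shape(2U, 3U, 4U);                 // [width, height, channels]
    permute(shape, PermutationVector(2U, 0U, 1U)); // shape[i] = old_shape[perm[i]]
    // shape is now [4, 2, 3]; as documented, validity of the permutation is not checked.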
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index de6c85ec7..367269281 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -197,7 +197,12 @@ inline void Iterator::reset(const size_t dimension)
}
}
-inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position)
+inline bool auto_init_if_empty(ITensorInfo &info,
+ const TensorShape &shape,
+ int num_channels,
+ DataType data_type,
+ int fixed_point_position,
+ QuantizationInfo quantization_info)
{
if(info.tensor_shape().total_size() == 0)
{
@@ -205,6 +210,22 @@ inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int
info.set_num_channels(num_channels);
info.set_tensor_shape(shape);
info.set_fixed_point_position(fixed_point_position);
+ info.set_quantization_info(quantization_info);
+ return true;
+ }
+
+ return false;
+}
+
+inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
+{
+ if(info_sink.tensor_shape().total_size() == 0)
+ {
+ info_sink.set_data_type(info_source.data_type());
+ info_sink.set_num_channels(info_source.num_channels());
+ info_sink.set_tensor_shape(info_source.tensor_shape());
+ info_sink.set_fixed_point_position(info_source.fixed_point_position());
+ info_sink.set_quantization_info(info_source.quantization_info());
return true;
}
@@ -255,6 +276,17 @@ inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_
return false;
}
+inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
+{
+ if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
+ {
+ info.set_quantization_info(quantization_info);
+ return true;
+ }
+
+ return false;
+}
+
inline ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, InterpolationPolicy policy, BorderSize border_size, bool border_undefined)
{
const auto wr = static_cast<float>(dst_shape[0]) / static_cast<float>(src_info.tensor_shape()[0]);
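A sketch of how the new overloads compose, e.g. inside a kernel's configure() (tensor infos hypothetical):

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    TensorInfo src(TensorShape(16U, 16U), 1, DataType::QASYMM8);
    src.set_quantization_info(QuantizationInfo(0.5f, 10));

    TensorInfo dst; // empty: shape, type and quantization info are copied over
    const bool initialized = auto_init_if_empty(dst, src);

    // For an output with a known shape but no quantization info yet:
    TensorInfo out(TensorShape(16U, 16U), 1, DataType::QASYMM8);
    set_quantization_info_if_empty(out, src.quantization_info());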
diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h
index 960e18f3d..bc01df981 100644
--- a/arm_compute/core/IArray.h
+++ b/arm_compute/core/IArray.h
@@ -124,7 +124,7 @@ public:
/** Resizes the array to contain "num" elements. If "num" is smaller than the maximum array size, the content is reduced to its first "num" elements,
* "num" elements can't be bigger than the maximum number of values which can be stored in this array.
*
- * @param[in] num The new array size in number of elements
+ * @param[in] num The new array size in number of elements
*/
void resize(size_t num)
{
diff --git a/arm_compute/core/IMultiHOG.h b/arm_compute/core/IMultiHOG.h
index e91da7539..5e9ee3a4f 100644
--- a/arm_compute/core/IMultiHOG.h
+++ b/arm_compute/core/IMultiHOG.h
@@ -43,14 +43,14 @@ public:
virtual size_t num_models() const = 0;
/** Return a pointer to the requested HOG model
*
- * @param[in] index The index of the wanted HOG model.
+ * @param[in] index The index of the wanted HOG model.
*
* @return A pointer to the HOG model
*/
virtual IHOG *model(size_t index) = 0;
/** Return a const pointer to the requested HOG model
*
- * @param[in] index The index of the wanted HOG model.
+ * @param[in] index The index of the wanted HOG model.
*
* @return A const pointer to the HOG model
*/
diff --git a/arm_compute/core/IMultiImage.h b/arm_compute/core/IMultiImage.h
index 6ed3c785c..0d11c2c6b 100644
--- a/arm_compute/core/IMultiImage.h
+++ b/arm_compute/core/IMultiImage.h
@@ -43,14 +43,14 @@ public:
virtual const MultiImageInfo *info() const = 0;
/** Return a pointer to the requested plane of the image.
*
- * @param[in] index The index of the wanted planed.
+ * @param[in] index The index of the wanted plane.
*
* @return A pointer to the plane
*/
virtual IImage *plane(unsigned int index) = 0;
/** Return a constant pointer to the requested plane of the image.
*
- * @param[in] index The index of the wanted planed.
+ * @param[in] index The index of the wanted plane.
*
* @return A constant pointer to the plane
*/
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index bb3ac6e35..9a67712f3 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -29,13 +29,14 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ICloneable.h"
#include <cstddef>
namespace arm_compute
{
/** Store the tensor's metadata */
-class ITensorInfo
+class ITensorInfo : public misc::ICloneable<ITensorInfo>
{
public:
/** Default virtual destructor */
@@ -45,15 +46,19 @@ public:
* @warning This resets the format to UNKNOWN.
*
* @param[in] data_type The new data type.
+ *
+ * @return Reference to this ITensorInfo object
*/
- virtual void set_data_type(DataType data_type) = 0;
+ virtual ITensorInfo &set_data_type(DataType data_type) = 0;
/** Set the number of channels to the specified value.
*
* @warning This resets the format to UNKNOWN.
*
* @param[in] num_channels New number of channels.
+ *
+ * @return Reference to this ITensorInfo object
*/
- virtual void set_num_channels(int num_channels) = 0;
+ virtual ITensorInfo &set_num_channels(int num_channels) = 0;
/** Set the format of an already initialized tensor.
*
* @note If the data type has already been configured (i.e. not UNKNOWN) it
@@ -61,23 +66,41 @@ public:
* be based on the format.
*
* @param[in] format Single-plane format of the tensor.
+ *
+ * @return Reference to this ITensorInfo object
*/
- virtual void set_format(Format format) = 0;
+ virtual ITensorInfo &set_format(Format format) = 0;
/** Set the shape of an already initialized tensor.
*
* @warning Changing the shape requires to recompute the strides and is
* therefore only possible if the tensor hasn't been allocated yet.
*
* @param[in] shape New tensor shape.
+ *
+ * @return Reference to this ITensorInfo object
*/
- virtual void set_tensor_shape(TensorShape shape) = 0;
+ virtual ITensorInfo &set_tensor_shape(TensorShape shape) = 0;
/** Set the fixed point position to the specified value
*
* @warning The fixed point position must be set once the data type has been configured
*
* @param[in] fixed_point_position The new fixed point position
+ *
+ * @return Reference to this ITensorInfo object
*/
- virtual void set_fixed_point_position(int fixed_point_position) = 0;
+ virtual ITensorInfo &set_fixed_point_position(int fixed_point_position) = 0;
+ /** Set the quantization settings (scale and offset) of the tensor.
+ *
+ * @param[in] quantization_info QuantizationInfo containing the scale and offset
+ *
+ * @return Reference to this ITensorInfo object
+ */
+ virtual ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) = 0;
+ /** Resets the padding settings of the tensor.
+ *
+ * @return Reference to this ITensorInfo object
+ */
+ virtual ITensorInfo &reset_padding() = 0;
/** Update the offset to the first element and the strides to automatically computed values.
*
* @note The padding used by this method is really conservative so that the tensor can be used for most functions.
@@ -178,8 +201,10 @@ public:
/** Set the flag whether the tensor size can be changed.
*
* @param[in] is_resizable Flag that marks the tensor if it can be changed or not.
+ *
+ * @return Reference to this ITensorInfo object
*/
- virtual void set_is_resizable(bool is_resizable) = 0;
+ virtual ITensorInfo &set_is_resizable(bool is_resizable) = 0;
/** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined.
*
* @return The valid region.
@@ -190,6 +215,12 @@ public:
* @param[in] valid_region Valid region to set.
*/
virtual void set_valid_region(ValidRegion valid_region) = 0;
+
+ /** Get the quantization settings (scale and offset) of the tensor.
+ *
+ * @return A QuantizationInfo containing the scale and offset.
+ */
+ virtual QuantizationInfo quantization_info() const = 0;
};
}
#endif /*__ARM_COMPUTE_ITENSORINFO_H__ */
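Since the setters now return a reference to the ITensorInfo itself, call sites can chain them; a small sketch using TensorInfo, the concrete implementation:

    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    TensorInfo info;
    info.set_data_type(DataType::F16)
        .set_num_channels(1)
        .set_tensor_shape(TensorShape(8U, 8U))
        .set_is_resizable(false);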
diff --git a/arm_compute/core/Log.h b/arm_compute/core/Log.h
new file mode 100644
index 000000000..70e7c5111
--- /dev/null
+++ b/arm_compute/core/Log.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOG_H__
+#define __ARM_COMPUTE_LOG_H__
+
+#include "arm_compute/core/utils/logging/Macros.h"
+
+/** Create a default core logger
+ *
+ * @note It will eventually create all default loggers if they don't exist
+ */
+#define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER() \
+ do \
+ { \
+ if(arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \
+ { \
+ arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
+ } \
+ } while(false)
+
+/** Log a message to the core system logger
+ *
+ * @param[in] log_level Logging level
+ * @param[in] msg Message to log
+ */
+#define ARM_COMPUTE_LOG_MSG_CORE(log_level, msg) \
+ do \
+ { \
+ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
+ ARM_COMPUTE_LOG_MSG("CORE", log_level, msg); \
+ } while(false)
+
+/** Log a message with format to the core system logger
+ *
+ * @param[in] log_level Logging level
+ * @param[in] fmt String format (printf style)
+ * @param[in] ... Message arguments
+ */
+#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(log_level, fmt, ...) \
+ do \
+ { \
+ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", log_level, fmt, __VA_ARGS__); \
+ } while(false)
+
+/** Log a stream to the core system logger
+ *
+ * @param[in] log_level Logging level
+ * @param[in] ss Stream to log
+ */
+#define ARM_COMPUTE_LOG_STREAM_CORE(log_level, ss) \
+ do \
+ { \
+ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
+ ARM_COMPUTE_LOG_STREAM("CORE", log_level, ss); \
+ } while(false)
+
+/** Log information level message to the core system logger
+ *
+ * @param[in] msg Message to log
+ */
+#define ARM_COMPUTE_LOG_INFO_MSG_CORE(msg) \
+ do \
+ { \
+ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
+ ARM_COMPUTE_LOG_MSG_CORE(arm_compute::logging::LogLevel::INFO, msg); \
+ } while(false)
+
+/** Log information level formatted message to the core system logger
+ *
+ * @param[in] fmt String format (printf style)
+ * @param[in] ... Message arguments
+ */
+#define ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(fmt, ...) \
+ do \
+ { \
+ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(arm_compute::logging::LogLevel::INFO, fmt, __VA_ARGS__); \
+ } while(false)
+
+/** Log information level stream to the core system logger
+ *
+ * @param[in] ss Stream to log
+ */
+#define ARM_COMPUTE_LOG_INFO_STREAM_CORE(ss) \
+ do \
+ { \
+ ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
+ ARM_COMPUTE_LOG_STREAM_CORE(arm_compute::logging::LogLevel::INFO, ss); \
+ } while(false)
+
+#endif /* __ARM_COMPUTE_LOG_H__ */
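Typical use of these macros from a core component (a sketch; it assumes the library was built with logging enabled, otherwise the underlying ARM_COMPUTE_LOG_* macros compile to nothing):

    #include "arm_compute/core/Log.h"

    void configure_stage(int axis)
    {
        ARM_COMPUTE_LOG_INFO_MSG_CORE("Configuring kernel");
        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Reducing along axis %d", axis);
        ARM_COMPUTE_LOG_INFO_STREAM_CORE("axis = " << axis);
    }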
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
new file mode 100644
index 000000000..f0d7439d4
--- /dev/null
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEASYMM_H__
+#define __ARM_COMPUTE_NEASYMM_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qasymm8x8_t = uint8x8_t; /**< 8 bit quantized asymmetric vector with 8 elements */
+using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */
+using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */
+using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
+using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function computes x / 2^n rounded to the nearest integer, where n = exponent
+ *
+ * @param[in] x Vector of 4 elements
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
+
+/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
+ *
+ * vd*vs + vo
+ *
+ * @param[in] vd Input vector value in QASYMM8 format
+ * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
+ * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
+ *
+ * @return A 16-component vector in QASYMM8 format, saturated to fit
+ */
+uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
+} // namespace arm_compute
+#include "arm_compute/core/NEON/NEAsymm.inl"
+#endif // __ARM_COMPUTE_NEASYMM_H__
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
new file mode 100644
index 000000000..ce999a541
--- /dev/null
+++ b/arm_compute/core/NEON/NEAsymm.inl
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+namespace arm_compute
+{
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
+{
+ const int32x4_t shift_vec = vdupq_n_s32(-exponent);
+ const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+ const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+ return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
+inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
+{
+ // Convert uint8 vectors to uint16 vectors
+ const uint8x8_t vd_low = vget_low_u8(vd);
+ const uint8x8_t vd_high = vget_high_u8(vd);
+ uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low);
+ uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high);
+ // Convert uint16 vectors to uint32 vectors
+ uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8));
+ uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8));
+ uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8));
+ uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8));
+ // Convert uint32 vectors to float32 vectors
+ float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4);
+ float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4);
+ float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4);
+ float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4);
+ // vd = vd*vs + vo
+ A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
+ B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
+ C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
+ D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
+ // Convert float32 vectors to uint32 vectors
+ A_u32x4 = vcvtq_u32_f32(A_f32x4);
+ B_u32x4 = vcvtq_u32_f32(B_f32x4);
+ C_u32x4 = vcvtq_u32_f32(C_f32x4);
+ D_u32x4 = vcvtq_u32_f32(D_f32x4);
+ // Convert uint32 vectors to uint16 vectors (with saturation)
+ vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
+ vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));
+ // Convert uint16 vectors to uint8 vectors (with saturation)
+ return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
+}
+} // namespace arm_compute
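A small sketch of the multiply-accumulate above, e.g. for requantisation-style scaling of QASYMM8 data (input values illustrative):

    #include "arm_compute/core/NEON/NEAsymm.h"

    #include <arm_neon.h>

    using namespace arm_compute;

    const qasymm8x16_t in     = vdupq_n_u8(100);   // 16 lanes of value 100
    const float32x4_t  scale  = vdupq_n_f32(0.5f); // multiplier, duplicated across lanes
    const float32x4_t  offset = vdupq_n_f32(3.f);  // addend, duplicated across lanes
    // Each lane becomes 100 * 0.5 + 3 = 53, saturated to [0, 255] on the way back to uint8.
    const qasymm8x16_t out    = vmlaq_qasymm8(in, scale, offset);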
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index f8579e08b..5719b6361 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -179,19 +179,19 @@ void vst1_qs16(qint16_t *addr, qint16x4_t b);
void vst1q_qs8(qint8_t *addr, qint8x16_t b);
/** Store a single 16 bit fixed point vector to memory (8 elements)
-*
-* @param[in] addr Memory address where the 16 bit fixed point vector should be stored
-* @param[in] b 16 bit fixed point vector to store
-*
-*/
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b 16 bit fixed point vector to store
+ *
+ */
void vst1q_qs16(qint16_t *addr, qint16x8_t b);
/** Store two 16 bit fixed point vector to memory (8x2 elements)
-*
-* @param[in] addr Memory address where the 16 bit fixed point vectors should be stored
-* @param[in] b 16 bit fixed point vectors to store
-*
-*/
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vectors should be stored
+ * @param[in] b 16 bit fixed point vectors to store
+ *
+ */
void vst2q_qs16(qint16_t *addr, qint16x8x2_t b);
/** 16 bit fixed point vector saturating narrow (8 elements)
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index bbb440f59..6c31fa4fb 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -43,8 +43,13 @@
#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h"
#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
@@ -58,10 +63,16 @@
#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
@@ -72,7 +83,7 @@
#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
@@ -101,7 +112,12 @@
#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index 4c4085e54..5c60d73de 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -116,7 +116,7 @@ float32x4_t vtanhq_f32(float32x4_t val);
*/
float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Calculate hyperbolic tangent.
*
* tanh(x) = (e^2x - 1)/(e^2x + 1)
@@ -179,7 +179,7 @@ float16x8_t vexpq_f16(float16x8_t x);
* @return The calculated power.
*/
float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
#include "arm_compute/core/NEON/NEMath.inl"
#endif /* __ARM_COMPUTE_NEMATH_H__ */
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index ebfc52d9a..50f217c1f 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -168,7 +168,7 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
{
return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/* Exponent polynomial coefficients */
const std::array<float16x8_t, 8> exp_tab_f16 =
{
@@ -301,5 +301,5 @@ inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
{
return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
}
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
index ad8b02fbc..fa8a3be92 100644
--- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
@@ -80,7 +80,7 @@ protected:
float _alpha;
};
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Interface for the accumulate weighted kernel using F16 */
class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
{
@@ -88,9 +88,9 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
};
-#else /* ARM_COMPUTE_ENABLE_FP16 */
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
/** Interface for the accumulate squared kernel
*
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index 08fb3f915..1edda843d 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -26,10 +26,11 @@
#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/QAsymm8.h"
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_fp16.h>
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
namespace arm_compute
{
@@ -59,6 +60,16 @@ public:
* @param[in] activation_info Activation layer information.
*/
void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
+ * of the activation function. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -72,27 +83,33 @@ private:
using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window);
/** Function to apply an activation function on a tensor.
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window);
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Function to apply an activation function on a tensor.
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, float16_t>::value, void>::type activation(const Window &window);
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
/** Function to apply an activation function on a tensor.
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
/** Function to apply an activation function on a tensor.
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
+ */
+ template <ActivationLayerInfo::ActivationFunction F, typename T>
+ typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window);
+ /** Function to apply an activation function on a tensor.
+ *
+ * @param[in] window Region on which to execute the kernel
*/
template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type activation(const Window &window);
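The new static validate() lets callers vet a configuration before creating any real tensors; a sketch of the pattern (tensor infos illustrative):

    #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    const TensorInfo src(TensorShape(32U, 32U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(32U, 32U), 1, DataType::F32);
    const Status     s = NEActivationLayerKernel::validate(
        &src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    if(s.error_code() == ErrorCode::OK)
    {
        // Safe to call configure() with matching ITensor objects.
    }

The same validate-before-configure pattern applies to the other kernels gaining a static validate() in this release (arithmetic addition/subtraction, batch normalization, col2im, and so on).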
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
index edb738163..b830e022d 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -68,6 +68,16 @@ public:
* @param[in] policy Overflow policy.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] policy Overflow policy.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
index d6a219ffd..af81d396b 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -68,6 +68,16 @@ public:
* @param[in] policy Overflow policy.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 04c4c9ebb..f3c5574e7 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -54,14 +54,32 @@ public:
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
* The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] epsilon Small value to avoid division with zero.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
*/
void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result.
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division with zero.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *var,
+ const ITensorInfo *beta, const ITensorInfo *gamma,
+ float epsilon);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
index 1366adad3..29248f653 100644
--- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
@@ -46,7 +46,7 @@ public:
BorderSize border_size() const override;
};
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** NEON kernel to perform a Box 3x3 filter using F16 simd
*/
class NEBox3x3FP16Kernel : public NEBox3x3Kernel
@@ -55,8 +55,8 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
};
-#else /* ARM_COMPUTE_ENABLE_FP16 */
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
using NEBox3x3FP16Kernel = NEBox3x3Kernel;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEBOX3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
index 37d86685d..a57c3894b 100644
--- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -81,7 +81,7 @@ protected:
ITensor *_phase; /**< Destination tensor - Quantized phase */
};
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** NEON kernel to perform Gradient computation
*/
class NEGradientFP16Kernel : public NEGradientKernel
@@ -90,9 +90,9 @@ public:
// Inherited methods overridden:
void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override;
};
-#else /* ARM_COMPUTE_ENABLE_FP16 */
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
using NEGradientFP16Kernel = NEGradientKernel;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
*
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
index d537d49c5..243cc77a4 100644
--- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -26,6 +26,8 @@
#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Size2D.h"
+
namespace arm_compute
{
class ITensor;
@@ -66,12 +68,22 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
* while the rest represent batch of outputs. Data types supported: Same as @p input
* @param[in] convolved_dims Output convolved dimensions.
*/
- void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
+ void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
+ /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+ * while the rest represent batch of outputs. Data types supported: Same as @p input
+ * @param[in] convolved_dims Output convolved dimensions.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -93,7 +105,7 @@ private:
Col2ImFunctionPtr _func;
const ITensor *_input;
ITensor *_output;
- std::pair<unsigned int, unsigned int> _convolved_dims;
+ Size2D _convolved_dims;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NECOL2IMKERNEL_H__ */
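The configure() above replaces the old std::pair with Size2D; a call-site migration sketch (tensor and dimension names hypothetical):

    // Before: col2im.configure(&in, &out, std::make_pair(out_w, out_h));
    NECol2ImKernel col2im;
    col2im.configure(&in, &out, Size2D(out_w, out_h));
    // Or vet the configuration first:
    const Status s = NECol2ImKernel::validate(in.info(), out.info(), Size2D(out_w, out_h));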
diff --git a/arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h
new file mode 100644
index 000000000..707564683
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDECONVOLUTIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEDECONVOLUTIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform the upsampling (scaling) stage of the deconvolution layer */
+class NEDeconvolutionLayerUpsampleKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDeconvolutionLayerUpsampleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDeconvolutionLayerUpsampleKernel(const NEDeconvolutionLayerUpsampleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDeconvolutionLayerUpsampleKernel &operator=(const NEDeconvolutionLayerUpsampleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDeconvolutionLayerUpsampleKernel(NEDeconvolutionLayerUpsampleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDeconvolutionLayerUpsampleKernel &operator=(NEDeconvolutionLayerUpsampleKernel &&) = default;
+ /** Default destructor */
+ ~NEDeconvolutionLayerUpsampleKernel() = default;
+
+ /** Initialise the kernel's inputs, output and interpolation policy
+ *
+ * @param[in] input Source tensor. Data types supported: F32.
+ * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+ * @param[out] output Destination tensor. Data types supported: F32. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ */
+ void configure(const ITensor *input, const ITensor *offsets, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Function to perform scale using nearest interpolation on the given window */
+ void scale_nearest(const Window &window);
+
+ const ITensor *_offsets;
+ const ITensor *_input;
+ ITensor *_output;
+};
+} // arm_compute
+#endif /*__ARM_COMPUTE_NEDECONVOLUTIONLAYERKERNEL_H__ */
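Like any INEKernel, the upsample kernel is configured once and then dispatched
by the scheduler over its window. A usage sketch, assuming three pre-allocated
tensors (illustrative only):

    // input/offsets/output are ITensors that have already been allocated.
    NEDeconvolutionLayerUpsampleKernel upsample;
    upsample.configure(&input, &offsets, &output);
    NEScheduler::get().schedule(&upsample, Window::DimY); // split the work along Y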
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
index 784dfc3f5..6029873f2 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
@@ -34,21 +34,21 @@ class ITensor;
/** Interface for the depth concatenate kernel.
* The input tensor will be concatenated into the output tensor.
*/
-class NEDepthConcatenateKernel : public INEKernel
+class NEDepthConcatenateLayerKernel : public INEKernel
{
public:
/** Default constructor */
- NEDepthConcatenateKernel();
+ NEDepthConcatenateLayerKernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConcatenateKernel(const NEDepthConcatenateKernel &) = delete;
+ NEDepthConcatenateLayerKernel(const NEDepthConcatenateLayerKernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConcatenateKernel &operator=(const NEDepthConcatenateKernel &) = delete;
+ NEDepthConcatenateLayerKernel &operator=(const NEDepthConcatenateLayerKernel &) = delete;
/** Allow instances of this class to be moved */
- NEDepthConcatenateKernel(NEDepthConcatenateKernel &&) = default;
+ NEDepthConcatenateLayerKernel(NEDepthConcatenateLayerKernel &&) = default;
/** Allow instances of this class to be moved */
- NEDepthConcatenateKernel &operator=(NEDepthConcatenateKernel &&) = default;
+ NEDepthConcatenateLayerKernel &operator=(NEDepthConcatenateLayerKernel &&) = default;
/** Default destructor */
- ~NEDepthConcatenateKernel() = default;
+ ~NEDepthConcatenateLayerKernel() = default;
/** Initialise the kernel's inputs and output
*
* @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32.
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
index 332406f23..af51ded87 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
@@ -34,19 +34,19 @@ namespace arm_compute
class ITensor;
/** Depth conversion kernel */
-class NEDepthConvertKernel : public INEKernel
+class NEDepthConvertLayerKernel : public INEKernel
{
public:
/** Default constructor*/
- NEDepthConvertKernel();
+ NEDepthConvertLayerKernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConvertKernel(const NEDepthConvertKernel &) = delete;
+ NEDepthConvertLayerKernel(const NEDepthConvertLayerKernel &) = delete;
/** Default move constructor */
- NEDepthConvertKernel(NEDepthConvertKernel &&) = default;
+ NEDepthConvertLayerKernel(NEDepthConvertLayerKernel &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConvertKernel &operator=(const NEDepthConvertKernel &) = delete;
+ NEDepthConvertLayerKernel &operator=(const NEDepthConvertLayerKernel &) = delete;
/** Default move assignment operator */
- NEDepthConvertKernel &operator=(NEDepthConvertKernel &&) = default;
+ NEDepthConvertLayerKernel &operator=(NEDepthConvertLayerKernel &&) = default;
/** Set the input and output of the kernel
*
* Valid conversions Input -> Output :
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
new file mode 100644
index 000000000..b8f01cb63
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__
+#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
+ */
+class NEDepthwiseConvolutionLayer3x3Kernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDepthwiseConvolutionLayer3x3Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionLayer3x3Kernel(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseConvolutionLayer3x3Kernel &operator=(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
+ /** Default Move Constructor. */
+ NEDepthwiseConvolutionLayer3x3Kernel(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
+ /** Default move assignment operator. */
+ NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
+ /** Initialize the function's source, destination, conv and border_size.
+ *
+ * @param[in] input Source tensor. Data type supported: F32.
+ * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ */
+ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
+
+private:
+ BorderSize _border_size;
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_weights;
+ PadStrideInfo _conv_info;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ \ No newline at end of file
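For reference, a depthwise convolution applies one filter per input channel
instead of mixing channels. A scalar model of the 3x3 case with unit stride and
no padding (an illustrative sketch; the kernel itself handles strides and
padding via conv_info):

    #include <cstddef>
    #include <vector>

    using Plane3D = std::vector<std::vector<std::vector<float>>>; // [C][H][W]

    // Assumes H and W are at least 3; each channel c uses its own 3x3 filter w[c].
    void depthwise_conv3x3(const Plane3D &in, const Plane3D &w, Plane3D &out)
    {
        const std::size_t C = in.size(), H = in[0].size(), W = in[0][0].size();
        out.assign(C, std::vector<std::vector<float>>(H - 2, std::vector<float>(W - 2, 0.f)));
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t y = 0; y + 2 < H; ++y)
                for(std::size_t x = 0; x + 2 < W; ++x)
                    for(std::size_t ky = 0; ky < 3; ++ky)
                        for(std::size_t kx = 0; kx < 3; ++kx)
                            out[c][y][x] += in[c][y + ky][x + kx] * w[c][ky][kx];
    }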
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
new file mode 100644
index 000000000..fde474d1f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depthwise im2col reshape kernel.
+ * This kernel reshapes the input's lowest 3 dimensions to a new 3D shape, where the output's first dimension is
+ * the linear patch size (FILTER_WIDTH * FILTER_HEIGHT), the second dimension is the number of patches per image, and the third dimension is unchanged.
+ **/
+class NEDepthwiseIm2ColKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDepthwiseIm2ColKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseIm2ColKernel(const NEDepthwiseIm2ColKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseIm2ColKernel &operator=(const NEDepthwiseIm2ColKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDepthwiseIm2ColKernel(NEDepthwiseIm2ColKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDepthwiseIm2ColKernel &operator=(NEDepthwiseIm2ColKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32
+ * @param[out] output The output tensor. First 3 lower dimensions represent a transform of each 3D input,
+ * while every dimension above 3 represents a batch. Data types supported: Same as @p input
+ * @param[in] kernel_dims The kernel dimensions (width and height).
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias.
+ */
+ void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ Size2D _kernel_dims;
+ PadStrideInfo _conv_info;
+ bool _has_bias;
+};
+} // arm_compute
+#endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */
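The shape arithmetic implied by the comment above can be made concrete. A small
sketch, assuming symmetric padding and that has_bias appends one extra element
per patch (both assumptions of this illustration):

    #include <cstddef>

    std::size_t im2col_patch_size(std::size_t kernel_w, std::size_t kernel_h, bool has_bias)
    {
        return kernel_w * kernel_h + (has_bias ? 1 : 0); // first output dimension
    }

    std::size_t im2col_num_patches(std::size_t in_w, std::size_t in_h,
                                   std::size_t kernel_w, std::size_t kernel_h,
                                   std::size_t pad_x, std::size_t pad_y,
                                   std::size_t stride_x, std::size_t stride_y)
    {
        const std::size_t conv_w = (in_w + 2 * pad_x - kernel_w) / stride_x + 1; // patches per row
        const std::size_t conv_h = (in_h + 2 * pad_y - kernel_h) / stride_y + 1; // rows of patches
        return conv_w * conv_h; // second output dimension; the third (IFM) is unchanged
    }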
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
new file mode 100644
index 000000000..8b33fae6f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISEVECTORTOTENSORKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHWISEVECTORTOTENSORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depthwise vector to tensor kernel.
+ *
+ * This kernel takes the 1D tensor that's been produced by the MatrixVectorMultiply
+ * kernel and reshapes it to given width and height (previously calculated, based
+ * on input/weights dimensions and convolution strides and padding).
+ *
+ **/
+class NEDepthwiseVectorToTensorKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDepthwiseVectorToTensorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseVectorToTensorKernel(const NEDepthwiseVectorToTensorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseVectorToTensorKernel &operator=(const NEDepthwiseVectorToTensorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDepthwiseVectorToTensorKernel(NEDepthwiseVectorToTensorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDepthwiseVectorToTensorKernel &operator=(NEDepthwiseVectorToTensorKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input vector to convert. Data type supported: F32.
+ * @param[out] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input.
+ * @param[in] conv_w The converted tensor's width.
+ * @param[in] conv_h The converted tensor's height.
+ */
+ void configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ std::pair<size_t, size_t> _conv_dims;
+};
+} // arm_compute
+#endif /*__ARM_COMPUTE_NEDEPTHWISEVECTORTOTENSORKERNEL_H__ */
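The reshape itself is a plain row-major unflatten. A scalar model for a single
plane (illustrative only):

    #include <cstddef>
    #include <vector>

    std::vector<std::vector<float>> vector_to_tensor(const std::vector<float> &in,
                                                     std::size_t conv_w, std::size_t conv_h)
    {
        std::vector<std::vector<float>> out(conv_h, std::vector<float>(conv_w));
        for(std::size_t y = 0; y < conv_h; ++y)
            for(std::size_t x = 0; x < conv_w; ++x)
                out[y][x] = in[y * conv_w + x]; // walk the flat vector row by row
        return out;
    }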
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
new file mode 100644
index 000000000..2e986117d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depthwise weights reshape kernel.
+ * This kernel reshapes the original weights' lowest 2 dimensions into a single column,
+ * with the second dimension equal to the original depth size.
+ **/
+class NEDepthwiseWeightsReshapeKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDepthwiseWeightsReshapeKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseWeightsReshapeKernel(const NEDepthwiseWeightsReshapeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthwiseWeightsReshapeKernel &operator=(const NEDepthwiseWeightsReshapeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDepthwiseWeightsReshapeKernel(NEDepthwiseWeightsReshapeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDepthwiseWeightsReshapeKernel &operator=(NEDepthwiseWeightsReshapeKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: F32.
+ * @param[out] output The output tensor. Data type supported: same as @p input.
+ * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *biases);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_biases;
+};
+} // arm_compute
+#endif /*__ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ */
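A scalar model of the reshape for one [W, H] weight plane, with the optional
bias appended as the last element; the element order shown is an assumption of
this sketch:

    #include <vector>

    std::vector<float> reshape_weights_plane(const std::vector<std::vector<float>> &plane,
                                             const float *bias)
    {
        std::vector<float> col;
        for(const auto &row : plane)
            col.insert(col.end(), row.begin(), row.end()); // flatten W x H into one column
        if(bias != nullptr)
        {
            col.push_back(*bias); // optional bias element
        }
        return col;
    }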
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
index 7613b586d..5d46516f6 100644
--- a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
@@ -64,17 +64,17 @@ public:
private:
/** Function to perform derivative along the X direction on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void derivative_x(const Window &window);
/** Function to perform derivative along the Y direction on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void derivative_y(const Window &window);
/** Function to perform derivative along the X and Y direction on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void derivative_xy(const Window &window);
/** Common signature for all the specialised derivative functions
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
index 14c8e9c7e..05ade1c5d 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -57,6 +57,16 @@ public:
* Data type supported: Same as @p input
*/
void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerBiasAccumulateKernel
+ *
+ * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+ * Data type supported: QS8/QS16/F16/F32
+ * @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
+ * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+ * Data type supported: Same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output = nullptr);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 370ddca48..4529120f0 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -62,6 +62,20 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel
+ *
+ * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] output Output tensor.
+ * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
index e298bfdeb..9e0fe8059 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** AssemblyBase/armv7a NEON kernel to multiply two input matrices "A" and "B". */
+/** Base class for GEMM NEON kernels implemented in Assembly. */
class NEGEMMAssemblyBaseKernel : public INEKernel
{
public:
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 1c0d85c27..fd93def0c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -56,10 +56,18 @@ public:
NEGEMMInterleave4x4Kernel();
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QS16/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
+ *
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
+ * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -67,7 +75,7 @@ public:
private:
/** Common signature for all the transpose functions
*
- * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[out] output The output tensor. Data type supported: same as @p input
* @param[in] window Region on which to execute the kernel.
*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
new file mode 100644
index 000000000..b9bb18d2b
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVEBLOCKEDKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMINTERLEAVEBLOCKEDKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to interleave the elements of a matrix
+ *
+ * Interleave_Blocked copies a block of values at a time instead of just one. Its main use is in gemmlowp with the "dot product"
+ * instruction, where each operation consumes 4 values, so blocks of 4 values need to be copied.
+ *
+ */
+class NEGEMMInterleaveBlockedKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEGEMMInterleaveBlockedKernel();
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8
+ * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] block_height The height of the blocks to be interleaved.
+ * @param[in] block_width The width of the blocks to be interleaved.
+ * @param[in] transpose True if transpose operation must be performed, false otherwise.
+ */
+ void configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleaveBlockedKernel
+ *
+ * @param[in] input Input tensor. Data types supported: U8
+ * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] block_height The height of the blocks to be interleaved.
+ * @param[in] block_width The width of the blocks to be interleaved.
+ * @param[in] transpose True if transpose operation must be performed, false otherwise.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ unsigned int _block_height;
+ unsigned int _block_width;
+ bool _transpose;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVEBLOCKEDKERNEL_H__*/
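A scalar model of blocked interleaving: rows are grouped into bands of
block_height and each band is emitted block_width values at a time, so that the
values consumed together by a dot-product instruction end up contiguous. The
exact traversal order is an assumption of this sketch:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Assumes a non-empty, rectangular input matrix.
    std::vector<uint8_t> interleave_blocked(const std::vector<std::vector<uint8_t>> &m,
                                            std::size_t block_h, std::size_t block_w)
    {
        const std::size_t rows = m.size();
        const std::size_t cols = m[0].size();
        std::vector<uint8_t> out;
        out.reserve(rows * cols);
        for(std::size_t r0 = 0; r0 < rows; r0 += block_h)         // band of rows
            for(std::size_t c0 = 0; c0 < cols; c0 += block_w)     // block of columns
                for(std::size_t r = r0; r < r0 + block_h && r < rows; ++r)
                    for(std::size_t c = c0; c < c0 + block_w && c < cols; ++c)
                        out.push_back(m[r][c]);                   // block becomes a contiguous run
        return out;
    }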
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index f526d213c..7435994b8 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -35,12 +35,9 @@ class ITensor;
* @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
* This kernel performs the following computation:
*
- * -# Convert a values from uint8 to int32 and add a_offset to each of them.
- * -# Convert b values from uint8 to int32 and add b_offset to each of them.
- * -# Compute the int32 matrix product of the resulting a * b.
- * -# Add output_offset to each entry of the result.
- * -# Multiply each entry of the result and round to the nearest integer
- * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ * -# Convert the a values from QASYMM8 to int32
+ * -# Convert the b values from QASYMM8 to int32
+ * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
*
*/
class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
@@ -61,16 +58,21 @@ public:
* The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
* kernels change the layout of the original matrices to be more cache-friendly.
*
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8
- * @param[in] input1 Input tensor containing the transposed Matrix B. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication, Data type supported: same as @p input0
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_offset Offset to be added to each element of the output matrix
- * @param[in] output_mult_int Value to be multipied to each entry of the result.
- * @param[in] shift Number of bits to shift right the result.
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
+ * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
*/
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
+ *
+ * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
+ * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
+ * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
+
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -78,11 +80,7 @@ private:
const ITensor *_input0;
const ITensor *_input1;
ITensor *_output;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _output_offset;
- int32_t _output_mult_int;
- int32_t _shift;
+ bool _slide_matrix_b;
};
} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/ \ No newline at end of file
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
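A reference model of the computation listed above: both operands are widened to
int32 and the raw product is stored as S32, leaving all offset handling to the
later stages (illustrative, row-major, unreshaped operands):

    #include <cstdint>

    void gemmlowp_mm(const uint8_t *a, const uint8_t *b, int32_t *c, int M, int N, int K)
    {
        for(int i = 0; i < M; ++i)
            for(int j = 0; j < N; ++j)
            {
                int32_t acc = 0;
                for(int k = 0; k < K; ++k)
                    acc += int32_t(a[i * K + k]) * int32_t(b[k * N + j]);
                c[i * N + j] = acc; // raw int32 accumulator, no offsets applied
            }
    }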
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
new file mode 100644
index 000000000..531968304
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (vector_sum_col[k] * a_offset) +
+ * (vector_sum_row[i] * b_offset) +
+ * (a_offset * b_offset * k)
+ *
+ */
+class NEGEMMLowpOffsetContributionKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMLowpOffsetContributionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] k Number of matrix A columns or Matrix B rows
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ */
+ void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
+ *
+ * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+ * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+ * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_vector_sum_col;
+ const ITensor *_vector_sum_row;
+ ITensor *_mm_result;
+ int32_t _a_offset;
+ int32_t _b_offset;
+ int32_t _k_offset;
+ bool _slide_vector_sum_col;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */
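A scalar model of the in-place update documented above, including the nullptr
conventions for the two sum vectors (illustrative only):

    #include <cstdint>

    void offset_contribution(int32_t *mm_result,
                             const int32_t *vector_sum_col, // may be nullptr when a_offset == 0
                             const int32_t *vector_sum_row, // may be nullptr when b_offset == 0
                             int M, int N, int32_t k, int32_t a_offset, int32_t b_offset)
    {
        for(int i = 0; i < M; ++i)
            for(int j = 0; j < N; ++j)
                mm_result[i * N + j] += (vector_sum_col != nullptr ? vector_sum_col[j] * a_offset : 0)
                                      + (vector_sum_row != nullptr ? vector_sum_row[i] * b_offset : 0)
                                      + a_offset * b_offset * k;
    }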
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
new file mode 100644
index 000000000..b1dd1fb2d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
+ * @param[in] result_shift Integer value used to round the result to the nearest division by a power of two after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to the result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <bool is_bounded_relu>
+ void run(const Window &window);
+
+ /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(const Window &window);
+
+ QuantizeDownFunctionPtr _func;
+ const ITensor *_input;
+ const ITensor *_bias;
+ ITensor *_output;
+ int _result_fixedpoint_multiplier;
+ int _result_shift;
+ int _result_offset_after_shift;
+ int _min;
+ int _max;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ */
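The fixed-point pipeline above can be modelled with gemmlowp-style scalar
helpers. This sketch adds the bias before the fixed-point multiply and ignores
the saturating corner case of the doubling high multiply (both are assumptions
of the illustration):

    #include <algorithm>
    #include <cstdint>

    // Rounding doubling high multiply: high 32 bits of 2*a*b, rounded.
    int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        const int64_t prod  = int64_t(a) * int64_t(b);
        const int32_t nudge = prod >= 0 ? (1 << 30) : (1 - (1 << 30));
        return int32_t((prod + nudge) >> 31);
    }

    // Rounding arithmetic shift right by 'shift' bits (round half away from zero).
    int32_t rounding_shift_right(int32_t x, int shift)
    {
        const int32_t mask      = (int32_t(1) << shift) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> shift) + (remainder > threshold ? 1 : 0);
    }

    uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t multiplier,
                                     int shift, int32_t offset_after_shift,
                                     int32_t min, int32_t max)
    {
        int32_t x = rounding_doubling_high_mul(acc + bias, multiplier);
        x = rounding_shift_right(x, shift);
        x += offset_after_shift;
        x = std::max(min, std::min(max, x));           // optional bounded-ReLU clamp
        return uint8_t(std::max(0, std::min(255, x))); // final QASYMM8 range
    }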
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
new file mode 100644
index 000000000..10b333032
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_offset Offset to be added to each element of the input matrix
+ * @param[in] result_mult_int Value to be multiplied with each element of the input matrix once the result_offset has been added
+ * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <bool is_bounded_relu>
+ void run(const Window &window);
+
+ /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::*)(const Window &window);
+
+ QuantizeDownFunctionPtr _func;
+ const ITensor *_input;
+ const ITensor *_bias;
+ ITensor *_output;
+ int _result_offset;
+ int _result_mult_int;
+ int _result_shift;
+ int _min;
+ int _max;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ */
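The integer-scaling variant is simpler; a scalar model following the steps
listed above, with the bias term omitted for brevity (an assumption of this
sketch):

    #include <algorithm>
    #include <cstdint>

    uint8_t quantize_down_scale(int32_t acc, int32_t result_offset, int32_t result_mult_int,
                                int result_shift, int32_t min, int32_t max)
    {
        int32_t x = (acc + result_offset) * result_mult_int; // offset, then scale
        x >>= result_shift;                                  // arithmetic shift right
        x = std::max(min, std::min(max, x));                 // optional bounded-ReLU clamp
        return uint8_t(std::max(0, std::min(255, x)));       // final QASYMM8 range
    }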
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
new file mode 100644
index 000000000..38c353e29
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for all NEON reduction kernels */
+class INEGEMMLowpReductionKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ INEGEMMLowpReductionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: QASYMM8
+ * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+ * @param[in] k Number of matrix A columns (or matrix B rows)
+ * @param[in] is_reshaped True if the input tensor has been reshaped
+ */
+ virtual void configure(const ITensor *input, ITensor *output, int32_t k, bool is_reshaped) = 0;
+
+protected:
+ const ITensor *_input;
+ ITensor *_output;
+ int32_t _k;
+ bool _is_reshaped;
+};
+
+/** NEON kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] mtx_a Input tensor. Data type supported: QASYMM8
+ * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+ * @param[in] num_mtx_a_cols Number of matrix A columns
+ * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
+ */
+ void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override;
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
+ *
+ * @param[in] mtx_a Input tensor. Data type supported: QASYMM8
+ * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+ * @param[in] num_mtx_a_cols Number of matrix A columns
+ * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+};
+
+/** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] mtx_b Input tensor. Data type supported: QASYMM8
+ * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+ * @param[in] num_mtx_b_rows Number of matrix B rows
+ * @param[in] is_transposed1xW True if the input tensor is transposed 1xW
+ */
+ void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override;
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
+ *
+     * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8
+ * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+ * @param[in] num_mtx_b_rows Number of matrix B rows
+ * @param[in] is_transposed1xW True if the input tensor is transposed 1xW
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGEMMLOWPREDUCTIONKERNEL_H__ */
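// [Editor's sketch, not part of the patch] Typical use of the two reduction
// kernels declared above when preparing the GEMMLowp offset contribution.
// Tensor creation and allocation are elided; all names are illustrative.
//
//   NEGEMMLowpMatrixAReductionKernel vector_sum_row_kernel;
//   NEGEMMLowpMatrixBReductionKernel vector_sum_col_kernel;
//   // mtx_a is QASYMM8, k = number of A columns, A not interleaved 4x4:
//   vector_sum_row_kernel.configure(&mtx_a, &vector_sum_row, k, false);
//   // mtx_b is QASYMM8, k = number of B rows, B not transposed 1xW:
//   vector_sum_col_kernel.configure(&mtx_b, &vector_sum_col, k, false);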
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
new file mode 100644
index 000000000..d844af5d5
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
+#define __ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply a matrix by a vector */
+class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEGEMMMatrixVectorMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixVectorMultiplyKernel(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixVectorMultiplyKernel &operator=(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixVectorMultiplyKernel(NEGEMMMatrixVectorMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+     * @param[in]  input0 First input tensor. Data types supported: F16/F32
+     * @param[in]  input1 Second input tensor. Data types supported: same as @p input0.
+     * @param[out] output Output tensor which stores the result of the matrix-vector multiplication. Data type supported: same as @p input0.
+ */
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index 4d0bb2a48..e8ee2a7d2 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -70,10 +70,18 @@ class NEGEMMTranspose1xWKernel : public INESimpleKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
+ *
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor info. Data type supported: same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
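// [Editor's sketch, not part of the patch] The static validate() added above
// allows a configuration to be checked before any allocation happens,
// assuming the Status/ErrorCode API introduced in this release's Error.h:
//
//   Status s = NEGEMMTranspose1xWKernel::validate(input.info(), output.info());
//   if(s.error_code() == ErrorCode::OK)
//   {
//       kernel.configure(&input, &output);
//   }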
diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
index 31779b520..d28501107 100644
--- a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
@@ -49,19 +49,17 @@ public:
/** Initialise the kernel's source, destination and border mode.
*
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data type supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: S16.
*/
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
+ void configure(const ITensor *input, ITensor *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
BorderSize border_size() const override;
private:
- BorderSize _border_size;
- int _l2_load_offset;
+ int _l2_load_offset;
};
/** NEON kernel to perform a GaussianPyramid (vertical pass) */
@@ -83,11 +81,10 @@ public:
/** Initialise the kernel's source, destination and border mode.
*
- * @param[in] input Source tensor. Data type supported: S16.
- * @param[out] output Destination tensor. Data type supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] input Source tensor. Data type supported: S16.
+ * @param[out] output Destination tensor. Data type supported: U8.
*/
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
+ void configure(const ITensor *input, ITensor *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
index 2aef420e4..c3c37e4d2 100644
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -99,7 +99,7 @@ private:
HarrisScoreFunction *_func;
};
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Interface for the Harris Score kernel using F16 */
template <int32_t block_size>
class NEHarrisScoreFP16Kernel : public INEHarrisScoreKernel
@@ -118,9 +118,9 @@ private:
/** Harris Score function to use for the particular image types passed to configure() */
HarrisScoreFunction *_func;
};
-#else /* ARM_COMPUTE_ENABLE_FP16 */
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <int32_t block_size>
using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ */
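// [Editor's note, not part of the patch] The #ifdef/using pair above is a
// compile-time fallback: when FP16 vector arithmetic is unavailable, the FP16
// kernel name simply aliases the generic kernel, so callers can always refer
// to NEHarrisScoreFP16Kernel. The idiom in minimal form, with a hypothetical
// feature macro:
//
//   #ifdef HAVE_FAST_PATH
//   class FastImpl { /* ... */ };
//   using Impl = FastImpl;
//   #else
//   class GenericImpl { /* ... */ };
//   using Impl = GenericImpl;
//   #endif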
diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
index 0fa911dbf..672472e08 100644
--- a/arm_compute/core/NEON/kernels/NEHistogramKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
@@ -82,28 +82,28 @@ public:
private:
/** Function to merge multiple partial histograms.
*
- * @param[out] global_hist Pointer to the final histogram.
- * @param[in] local_hist Pointer to the partial histograms.
- * @param[in] bins Number of bins.
+ * @param[out] global_hist Pointer to the final histogram.
+ * @param[in] local_hist Pointer to the partial histograms.
+ * @param[in] bins Number of bins.
*/
void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
/** Function to merge multiple minimum values of partial histograms.
*
- * @param[out] global_min Pointer to the global min value.
- * @param[in] local_min Local min value.
+ * @param[out] global_min Pointer to the global min value.
+ * @param[in] local_min Local min value.
*/
void merge_min(uint8_t *global_min, const uint8_t &local_min);
/** Function to perform histogram on the given window
- *
- * @param[in] win Region on which to execute the kernel
- * @param[in] info Info about the executing thread
+ *
+ * @param[in] win Region on which to execute the kernel
+ * @param[in] info Info about the executing thread
*/
void histogram_U8(Window win, const ThreadInfo &info);
/** Function to perform histogram on the given window where histogram is
* of fixed size 256 without ranges and offsets.
*
- * @param[in] win Region on which to execute the kernel
- * @param[in] info Info about the executing thread
+ * @param[in] win Region on which to execute the kernel
+ * @param[in] info Info about the executing thread
*/
void histogram_fixed_U8(Window win, const ThreadInfo &info);
/** Pre-calculate the pixel windowing for every possible pixel
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index 1a0735ea8..bc12b22e5 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -73,13 +73,27 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * Note: QASYMM8 works only for has_bias = false
* @param[out] output The output tensor. Data types supported: Same as @p input
* @param[in] kernel_dims The kernel dimensions (width and height).
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] has_bias In case biases are provided expands the matrix with 1.
*/
void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * Note: QASYMM8 works only for has_bias = false
+ * @param[in] output The output tensor. Data types supported: Same as @p input
+ * @param[in] kernel_dims The kernel dimensions (width and height).
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias In case biases are provided expands the matrix with 1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
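// [Editor's note, not part of the patch] Shape arithmetic implied by the
// im2col documentation above, for one [W, H, IFM] input, a KxK kernel and a
// symmetric pad/stride configuration:
//
//   patch_length    = K * K * IFM + (has_bias ? 1 : 0)
//   conv_out_width  = (W + 2 * pad - K) / stride + 1
//   conv_out_height = (H + 2 * pad - K) / stride + 1
//   patches         = conv_out_width * conv_out_height  // one im2col row per output position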
diff --git a/arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
index fbbe4bee9..7aa5116b6 100644
--- a/arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -31,21 +31,21 @@ namespace arm_compute
class ITensor;
/** Interface for performing a L2 normalize on a given axis given the square sum of it in this axis */
-class NEL2NormalizeKernel : public INEKernel
+class NEL2NormalizeLayerKernel : public INEKernel
{
public:
/** Default constructor */
- NEL2NormalizeKernel();
+ NEL2NormalizeLayerKernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEL2NormalizeKernel(const NEL2NormalizeKernel &) = delete;
+ NEL2NormalizeLayerKernel(const NEL2NormalizeLayerKernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEL2NormalizeKernel &operator=(const NEL2NormalizeKernel &) = delete;
+ NEL2NormalizeLayerKernel &operator=(const NEL2NormalizeLayerKernel &) = delete;
/** Allow instances of this class to be moved */
- NEL2NormalizeKernel(NEL2NormalizeKernel &&) = default;
+ NEL2NormalizeLayerKernel(NEL2NormalizeLayerKernel &&) = default;
/** Allow instances of this class to be moved */
- NEL2NormalizeKernel &operator=(NEL2NormalizeKernel &&) = default;
+ NEL2NormalizeLayerKernel &operator=(NEL2NormalizeLayerKernel &&) = default;
/** Default destructor */
- ~NEL2NormalizeKernel() = default;
+ ~NEL2NormalizeLayerKernel() = default;
/** Set the input and output tensors.
*
* @param[in] input Source tensor. Data types supported: F32.
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
index b853d2245..76c616360 100644
--- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -66,17 +66,17 @@ public:
private:
/** Function to perform magnitude on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void magnitude(const Window &window);
/** Function to perform phase on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void phase(const Window &window);
/** Function to perform magnitude and phase on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void magnitude_phase(const Window &window);
@@ -94,7 +94,7 @@ private:
ITensor *_phase; /**< Output - Phase */
};
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Template interface for the kernel to compute magnitude and phase */
template <MagnitudeType mag_type, PhaseType phase_type>
class NEMagnitudePhaseFP16Kernel : public INEKernel
@@ -130,17 +130,17 @@ public:
private:
/** Function to perform magnitude on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void magnitude(const Window &window);
/** Function to perform phase on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void phase(const Window &window);
/** Function to perform magnitude and phase on the given window
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
void magnitude_phase(const Window &window);
@@ -156,9 +156,9 @@ private:
ITensor *_magnitude; /**< Output - Magnitude */
ITensor *_phase; /**< Output - Phase */
};
-#else /* ARM_COMPUTE_ENABLE_FP16 */
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <MagnitudeType mag_type, PhaseType phase_type>
using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
index 3bce1a99f..da8aecff5 100644
--- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -78,7 +78,7 @@ protected:
ITensor *_output; /**< Destination tensor */
};
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32
*/
class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
@@ -92,8 +92,8 @@ public:
*/
void configure(const ITensor *input, ITensor *output, bool border_undefined);
};
-#else /* ARM_COMPUTE_ENABLE_FP16 */
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
} // namespace arm_compute
 #endif /* __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index 40fae3520..405daf106 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -57,6 +57,18 @@ public:
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                          and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32.
+     * @param[in] input_squared Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+     *                          Data type supported: same as @p input
+ * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index baa4112ca..10f990e7e 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -62,6 +62,23 @@ public:
* @param[in] rounding_policy Rounding policy.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ * For QS8/QS16 scale = 1 is the only supported value.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy.
+ * @param[in] rounding_policy Rounding policy.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
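// [Editor's sketch, not part of the patch] The documented scale constraint as
// a self-contained checker; this is a hypothetical helper, not a library API.
inline bool pixelwise_scale_is_valid(float scale)
{
    if(scale == 1.f / 255.f)
    {
        return true; // round-to-nearest-even path
    }
    for(int n = 0; n <= 15; ++n)
    {
        if(scale == 1.f / static_cast<float>(1 << n))
        {
            return true; // 1/2^n, round-to-zero path
        }
    }
    return false;
}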
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 9d7c75179..87d14e5f9 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -55,6 +55,17 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
+ *
+ * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -66,14 +77,14 @@ private:
* @param[in] window_input Input region on which to execute the kernel.
* @param[in] window Output region on which to execute the kernel.
*/
- template <PoolingType pooling_type>
+ template <PoolingType pooling_type, bool exclude_padding = false>
void pooling2_f32(const Window &window_input, const Window &window);
/** Function to perform 2x2 pooling for float16_t.
*
* @param[in] window_input Input region on which to execute the kernel.
* @param[in] window Output region on which to execute the kernel.
*/
- template <PoolingType pooling_type>
+ template <PoolingType pooling_type, bool exclude_padding = false>
void pooling2_f16(const Window &window_input, const Window &window);
/** Function to perform 2x2 pooling for 8bit fixed point.
@@ -95,14 +106,14 @@ private:
* @param[in] window_input Input region on which to execute the kernel.
* @param[in] window Output region on which to execute the kernel.
*/
- template <PoolingType pooling_type>
+ template <PoolingType pooling_type, bool exclude_padding = false>
void pooling3_f32(const Window &window_input, const Window &window);
/** Function to perform 3x3 pooling.
*
* @param[in] window_input Input region on which to execute the kernel.
* @param[in] window Output region on which to execute the kernel.
*/
- template <PoolingType pooling_type>
+ template <PoolingType pooling_type, bool exclude_padding = false>
void pooling3_f16(const Window &window_input, const Window &window);
/** Function to perform 3x3 pooling for 8bit fixed point.
*
@@ -123,14 +134,14 @@ private:
* @param[in] window_input Input region on which to execute the kernel.
* @param[in] window Output region on which to execute the kernel.
*/
- template <PoolingType pooling_type>
+ template <PoolingType pooling_type, bool exclude_padding = false>
void pooling7_f32(const Window &window_input, const Window &window);
/** Function to perform NxN pooling.
*
* @param[in] window_input Input region on which to execute the kernel.
* @param[in] window Output region on which to execute the kernel.
*/
- template <PoolingType pooling_type>
+ template <PoolingType pooling_type, bool exclude_padding = false>
void poolingN_f32(const Window &window_input, const Window &window);
/** Common signature for all the specialised Pooling functions
*
@@ -144,7 +155,7 @@ private:
const ITensor *_input;
ITensor *_output;
PoolingLayerInfo _pool_info;
- int _num_elems_processed_per_iteration;
+ unsigned int _num_elems_processed_per_iteration;
BorderSize _border_size;
};
} // namespace arm_compute
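// [Editor's note, not part of the patch] The new exclude_padding template
// parameter changes the average-pooling divisor. Example: 3x3 average pooling
// at the top-left corner of a feature map with one pixel of zero padding,
// where only 4 of the 9 window positions fall inside the image:
//
//   exclude_padding = false : sum(4 valid values) / 9
//   exclude_padding = true  : sum(4 valid values) / 4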
diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h
index 5806275ce..7aa5de7a3 100644
--- a/arm_compute/core/NEON/kernels/NERemapKernel.h
+++ b/arm_compute/core/NEON/kernels/NERemapKernel.h
@@ -60,6 +60,7 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+ BorderSize border_size() const override;
private:
/** function to perform nearest interpolation on the given window */
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index 5ec585484..ac154d445 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -59,8 +59,10 @@ public:
* @param[out] output Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] policy Interpolation type to use
* @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
*/
- void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined);
+ void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined,
+ SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
index cce21569d..0fecfac15 100644
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -43,6 +43,14 @@ public:
* @param[out] output Destination tensor. Data types supported: same as @p input
*/
void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] output Destination tensor. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -78,14 +86,26 @@ public:
* @param[in] max Max values tensor. Data types supported: same as @p input.
* @param[out] output Destination tensor. Data types supported: same as @p input.
* @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input.
+ * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
*/
- void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum);
+ void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta = 1.0f);
+ /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DShiftExpSumKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] max Max values tensor. Data types supported: same as @p input
+ * @param[in] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input.
+ * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta = 1.0f);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window);
+ using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta);
private:
Logits1DShiftExpSumFunction *_func;
@@ -93,6 +113,7 @@ private:
const ITensor *_max;
ITensor *_output;
ITensor *_sum;
+ float _beta;
};
/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
@@ -118,6 +139,15 @@ public:
* @param[out] output Destination tensor. Data types supported: same as @p input.
*/
void configure(const ITensor *input, const ITensor *sum, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DNormKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32
+ * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input.
+ * @param[in] output Destination tensor. Data types supported: same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
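// [Editor's sketch, not part of the patch] What the three kernels above
// compute together, including the new beta parameter, as plain scalar C++:
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax_ref(const std::vector<float> &x, float beta = 1.0f)
{
    const float max_val = *std::max_element(x.begin(), x.end()); // NELogits1DMaxKernel
    std::vector<float> out(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i) // NELogits1DShiftExpSumKernel
    {
        out[i] = std::exp(beta * (x[i] - max_val));
        sum += out[i];
    }
    for(float &v : out) // NELogits1DNormKernel
    {
        v /= sum;
    }
    return out;
}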
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
index 71bd27437..855d270e4 100644
--- a/arm_compute/core/NEON/kernels/NETransposeKernel.h
+++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h
@@ -53,10 +53,18 @@ public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor. Data type supported: Same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h
index 3a1cab158..d7cb82f27 100644
--- a/arm_compute/core/NEON/kernels/NEWarpKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWarpKernel.h
@@ -66,17 +66,17 @@ public:
protected:
/** function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
virtual void warp_undefined(const Window &window) = 0;
/** function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
virtual void warp_constant(const Window &window) = 0;
/** function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
*
- * @param[in] window Region on which to execute the kernel
+ * @param[in] window Region on which to execute the kernel
*/
virtual void warp_replicate(const Window &window) = 0;
/** Common signature for all the specialised warp functions
diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
new file mode 100644
index 000000000..c1343044a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMWINOGRADLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMWINOGRADLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/winograd/tensor.hpp"
+
+namespace arm_compute
+{
+class ITensor;
+class NEWinogradLayerKernel;
+/** Wrapper around the Winograd 3x3 F32 convolution implementation (pimpl) */
+class Winograd3x3F32
+{
+public:
+ friend class NEWinogradLayerKernel;
+ Winograd3x3F32(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage);
+ ~Winograd3x3F32();
+ std::pair<void *, void *> get_nhwc_ptrs(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space);
+ void transform_weights(const void *const kernel, void *transform_working_space);
+ void reshape_input(const Tensor4DShape &input_shape, const PaddingType padding_type, const void *const input, void *working_space);
+ void reshape_output(const Tensor4DShape &input_shape, const PaddingType padding_type, void *const output);
+ void nchw2nhwc(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, const void *const input);
+ void nhwc2nchw(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, void *const output);
+
+private:
+ class Private;
+ std::unique_ptr<Private> _pimpl;
+};
+
+/** NEON kernel to execute the batched GEMMs of a Winograd-based convolution */
+class NEWinogradLayerKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEWinogradLayerKernel();
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradLayerKernel(const NEWinogradLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradLayerKernel &operator=(const NEWinogradLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEWinogradLayerKernel(NEWinogradLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEWinogradLayerKernel &operator=(NEWinogradLayerKernel &&) = default;
+
+ virtual ~NEWinogradLayerKernel() = default;
+
+ /** Initialise the kernel
+ *
+ * @param[in,out] output Output tensor to store the result of matrix multiplication. Data type supported: F32.
+     * @param[in]     convolver A pointer to the Winograd convolver; this object must already be configured and ready to execute 16 GEMMs.
+ */
+ void configure(ITensor *output, Winograd3x3F32 *convolver);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+    /** Get the memory required to instantiate a new Winograd operator. */
+ static size_t get_kernel_storage_size(const KernelShape &shape);
+
+    /** Get the memory required to apply a Winograd operator to some input. */
+ static size_t get_working_space_size(const Tensor4DShape &input_shape, const KernelShape &k_shape, const PaddingType padding);
+
+    /** Get the memory required to transform the kernel. */
+ static size_t get_kernel_transform_working_size(const KernelShape &shape);
+
+protected:
+ Winograd3x3F32 *_convolver;
+ ITensor *_output;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMWINOGRADLAYERKERNEL_H__*/
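// [Editor's sketch, not part of the patch] Intended call order for the
// Winograd API above, inferred from the declarations alone; KernelShape and
// Tensor4DShape construction comes from winograd/tensor.hpp and is assumed.
//
//   const size_t storage = NEWinogradLayerKernel::get_kernel_storage_size(kernel_shape);
//   const size_t ws_size = NEWinogradLayerKernel::get_working_space_size(input_shape, kernel_shape, padding);
//   Winograd3x3F32 conv(kernel_shape, input_shape, padding, kernel_storage_ptr);
//   conv.transform_weights(weights_ptr, transform_ws_ptr);
//   conv.reshape_input(input_shape, padding, input_ptr, working_space_ptr);
//   winograd_kernel.configure(&output_tensor, &conv); // run() then executes the 16 GEMMs
//   conv.reshape_output(input_shape, padding, output_ptr);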
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
new file mode 100644
index 000000000..33cd2d42d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+// Enable only if compiled for AArch64-V8A targets
+#ifdef ARM_COMPUTE_AARCH64_V8A
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
+class NEGEMMLowpAArch64A53Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+ /** Default constructor */
+ NEGEMMLowpAArch64A53Kernel();
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+
+private:
+ using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+ const ThreadInfo &info);
+ NEGEMMLowpAArch64A53 *_func;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_AARCH64_V8A */
+#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
new file mode 100644
index 000000000..a93df033d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+// Enable only if compiled for AArch64-V8A targets
+#ifdef ARM_COMPUTE_AARCH64_V8A
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
+class NEGEMMLowpAArch64Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+ /** Default constructor */
+ NEGEMMLowpAArch64Kernel();
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+
+private:
+ using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+ const ThreadInfo &info);
+ NEGEMMLowpAArch64 *_func;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_AARCH64_V8A */
+#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
new file mode 100644
index 000000000..b03e5fa1a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+// Enable only if compiled for AArch64-V8.2-A targets
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
+class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel
+ *
+     * The computed function is C = alpha * (A x B) + beta * C.
+ *
+ * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8
+ * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0
+ * @param[in] output Output tensor info to store the result of matrix multiplication.
+     *                    If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32
+     *
+     * @return a status
+     */
+ static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
+
+protected:
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
new file mode 100644
index 000000000..9480a6a5d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__
+#define __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
+class NEHGEMMAArch64FP16Kernel : public NEGEMMAssemblyBaseKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+protected:
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
index 00974436f..ef89e3aac 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -28,6 +28,6 @@ template<typename To, typename Tr>
class GemmCommon {
public:
virtual size_t get_working_size() const = 0;
- virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space = NULL) const = 0;
+ virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space) const = 0;
virtual ~GemmCommon() { }
};
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
index f7d3a94fa..659ef837f 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
@@ -24,6 +24,7 @@
#pragma once
#include <stdio.h>
+#include <cassert>
#include "gemm_common.hpp"
#include "profiler.hpp"
@@ -114,12 +115,13 @@ public:
// Work out the rounded size of M - needed for some buffers.
Mround = (M + (strat.out_height - 1)) / strat.out_height;
Mround *= strat.out_height;
+
}
// Actually execute the GEMM.
void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+ assert(working_space);
profiler prof;
-
int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
size_t diff = 0;
@@ -140,7 +142,7 @@ public:
int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
kern_k *= strat.k_unroll;
- prof(PROFILE_PREPA, [&](void) {
+ prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) {
if (trA ^ strategy::A_transpose) {
Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
} else {
@@ -154,7 +156,7 @@ public:
int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
- prof(PROFILE_PREPB, [&](void) {
+ prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) {
if (trB ^ strategy::B_transpose) {
Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
} else {
@@ -166,8 +168,8 @@ public:
unsigned int ymax = y + strat.out_height;
if (ymax > M) ymax = M;
- prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
- prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
+ prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
+ prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
}
}
}
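// [Editor's note, not part of the patch] The Mround computation in the
// constructor above is the standard round-up-to-multiple idiom:
//
//   Mround = ((M + out_height - 1) / out_height) * out_height
//
// e.g. M = 100, out_height = 8 gives ((100 + 7) / 8) * 8 = 13 * 8 = 104, so
// buffers sized from Mround cover a partially filled trailing row block.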
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp
new file mode 100644
index 000000000..f7659b9a6
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Actual kernel implementations
+#include "a64_gemm_s16_12x8/generic.hpp"
+
+// 12x8 GEMM "strategy" class for int16 operands.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class gemm_s16_12x8 {
+public:
+ typedef int16_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel = nullptr;
+
+ gemm_s16_12x8(const CPUInfo *ci) {
+ kernel = a64_gemm_s16_asimd_12x8;
+ }
+};
+
+#endif // __aarch64__
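// [Editor's note, not part of the patch] How the strategy constants above fit
// together: A is interleaved in panels of A_interleave = 8 rows and B in
// panels of B_interleave = 12 columns, matching the kernel's out_height = 8 by
// out_width = 12 output tile; k_unroll = 1 means no extra blocking along K.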
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
new file mode 100644
index 000000000..10259b2fd
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include <arm_neon.h>
+
+inline void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const int16_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+ for (int yb = 0; yb < ablocks; yb++)
+ {
+ const int16_t *a_ptr0 = a_ptr;
+ const int16_t *b_ptr = Bpanel;
+
+ for (int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ const bool odd_k = K & 0x1;
+ int k = (K+1)/2 - 1;
+
+ register int16x8_t aa asm("v0");
+ register int16x8_t ab asm("v1");
+ register int16x8_t b0 asm("v2");
+ register int16x8_t b1 asm("v3");
+ register int16x8_t b2 asm("v4");
+
+ __asm __volatile (
+ "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+ "movi v5.4s, #0\n"
+ "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+ "movi v6.4s, #0\n"
+ "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
+ "movi v7.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v8.4s, #0\n"
+ "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+ "movi v9.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v10.4s, #0\n"
+ "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
+ "movi v11.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #96]")
+ "movi v12.4s, #0\n"
+ "movi v13.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #96]")
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #160]")
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #160]")
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v24.4s, #0\n"
+ "add %x[a_ptr], %x[a_ptr], #0x10\n"
+ "movi v25.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v26.4s, #0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x18\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+
+ "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+ "1:\n" // Main loop
+ // First unroll
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ // Second unroll
+ "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
+ "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+ "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "add %x[a_ptr], %x[a_ptr], #0x20\n"
+ "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "subs %x[k], %x[k], #0x1\n"
+ "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
+ "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "add %x[b_ptr], %x[b_ptr], #0x30\n"
+ "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "bne 1b\n"
+
+ "2:\n" // Even tail
+ "cbnz %x[odd_k], 3f\n"
+
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "add %[a_ptr], %[a_ptr], #0x10\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "add %[b_ptr], %[b_ptr], #0x18\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+ "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "b 4f\n" // Complete write out
+
+ "3:\n" // Odd tail
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+
+ "4:\n" // End of function
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "str q27, [%x[c_ptr], #0x140]\n"
+ "str q12, [%x[c_ptr], #0x150]\n"
+ "str q20, [%x[c_ptr], #0x160]\n"
+ "str q28, [%x[c_ptr], #0x170]\n"
+ "add %x[c_ptr], %x[c_ptr], #0x180\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
+ [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
+ : [odd_k] "r" (odd_k)
+ : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
+ );
+ }
+ }
+}
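
Behaviourally, each (yb, xb) pair above produces one 8x12 int32 tile: the store sequence (q5, q13, q21 for row 0, then q6, q14, q22 for row 1, and so on) writes the tile out row-major. A scalar model of a single tile, under those panel layouts:

    #include <cstdint>

    // Illustrative scalar model of one tile of the kernel above. Assumes the
    // A panel supplies 8 int16 per k step (A_interleave = 8), the B panel 12
    // int16 per k step (B_interleave = 12), and a row-major 8x12 output tile.
    void gemm_s16_12x8_tile_ref(const int16_t *a, const int16_t *b, int32_t *c, int K)
    {
        for (int i = 0; i < 8 * 12; i++) c[i] = 0; // mirrors the movi zeroing

        for (int k = 0; k < K; k++)
            for (int r = 0; r < 8; r++)
                for (int col = 0; col < 12; col++)
                    c[r * 12 + col] += int32_t(a[k * 8 + r]) * int32_t(b[k * 12 + col]);
    }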
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp
new file mode 100644
index 000000000..88cbb361b
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Load the actual kernel
+#include "a64_gemm_s8_12x8/generic.hpp"
+
+class gemm_s8_12x8 {
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 4;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 4;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 4;
+
+ kern_type kernel = nullptr;
+
+ gemm_s8_12x8(const CPUInfo *ci) {
+ kernel = a64_gemm_s8_12x8;
+ }
+};
+
+#endif // __aarch64__
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
new file mode 100644
index 000000000..4ac2ba423
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+#include "dot_toolchain_support.h"
+#include <cassert>
+
+inline void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ assert(Apanel);
+ assert(Bpanel);
+ assert(Cpanel);
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+ const int W = K/4;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W+1)/2) - 1;
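+ // Worked example (illustrative): K = 32 int8 elements gives W = 8 sdot
+ // steps, so oddk = 0 and init_value_k = 3: three trips round the main
+ // loop (two steps each) plus the detached even-K tail for the last two.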
+ for (int yb=0; yb<ablocks; yb++) {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+ __asm __volatile (
+ _DECLARE_SDOT
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+
+ // Loop proper
+ "1:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "ins %[a0].d[1], x20\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+ ".purgem sdot\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+
+ }
+ }
+}
+
+#endif
+
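A note on the load pattern used throughout the kernel above: on Cortex-A55r1 a 128-bit ldr q cannot dual-issue, but 64-bit loads can, so every 128-bit operand is fetched as an ldr d plus an ldr x20 and merged with ins, letting each half pair with a multiply. The idiom in isolation (a stand-alone sketch, not part of this patch):

    #include <arm_neon.h>
    #include <cstdint>

    // Split 128-bit load: two 64-bit halves merged into one q register.
    static inline int8x16_t load_q_split(const int8_t *p)
    {
        int8x16_t v;
        uint64_t hi;
        __asm __volatile(
            "ldr %d[v], [%[p]]\n"      // low 64 bits of the eventual q register
            "ldr %[hi], [%[p], #8]\n"  // high 64 bits via a general-purpose register
            "ins %[v].d[1], %[hi]\n"   // merge the halves into the 128-bit vector
            : [v] "=w" (v), [hi] "=r" (hi)
            : [p] "r" (p)
            : "memory");
        return v;
    }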
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
new file mode 100644
index 000000000..1d6fd1623
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the SDOT instruction (in the absence of toolchain support)
+#define _DECLARE_SDOT ".altmacro\n"\
+ ".macro sdot opd:req, opn:req, opm:req\n"\
+ "local vd, vn, vm, h, l\n"\
+ ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
+ ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
+ ".set vd,\\reg\n"\
+ ".endif\n"\
+ ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
+ ".set vn,\\reg\n"\
+ ".endif\n"\
+ ".irp idx,0,1,2,3\n"\
+ ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
+ ".set vm,\\reg\n"\
+ ".set h,\\idx / 2\n"\
+ ".set l,\\idx %% 2\n"\
+ ".endif\n"\
+ ".endr\n"\
+ ".endr\n"\
+ ".ifndef vd\n"\
+ ".error \"Bad operand \\opd\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef vn\n"\
+ ".error \"Bad operand \\opn\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef vm\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef h\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef l\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
+ ".endm\n"\
+
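The final .int line is the whole trick: it hand-assembles the ARMv8.2 SDOT (by element) encoding from the operand register numbers. The same expression can be checked off-line; a small sketch (the function name is illustrative):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the macro's ".int" expression for sdot vd.4s, vn.16b, vm.4b[idx].
    constexpr uint32_t sdot_by_element(unsigned vd, unsigned vn, unsigned vm, unsigned idx)
    {
        const unsigned h = idx / 2, l = idx % 2; // the element index is encoded as H:L
        return 0x4f80e000u | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11);
    }

    // "sdot v8.4s, v2.16b, v0.4b[0]" assembles to 0x4f80e048.
    static_assert(sdot_by_element(8, 2, 0, 0) == 0x4f80e048, "encoding check");

    int main()
    {
        std::printf("0x%08x\n", sdot_by_element(8, 2, 0, 0));
        return 0;
    }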
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
new file mode 100644
index 000000000..bfad0373b
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+#include "dot_toolchain_support.h"
+#include <cassert>
+
+
+inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ assert(Apanel);
+ assert(Bpanel);
+ assert(Cpanel);
+ K /= 4; // Each sdot consumes four int8 values per lane, so count in sdot steps.
+ const long int row_jump = 0; // Panel-stride fix-ups added to b_ptr below; zero for this kernel's layout.
+ const long int block_jump = 0;
+ const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel);
+ int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel);
+ for (int yb=0; yb<ablocks; yb++) {
+ const int32_t *a_ptr0 = a_ptr;
+ const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel);
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ _DECLARE_SDOT
+
+ // Loop proper
+ "1:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "add %[b_ptr], %[b_ptr], %[block_jump]\n"
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+ ".purgem sdot\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+ }
+ }
+}
+
+#endif
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp
new file mode 100644
index 000000000..1588f049f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Load the actual kernel
+#include "a64_gemm_s8_4x4/generic.hpp"
+
+class gemm_s8_4x4 {
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 4;
+ static const int A_block = 16;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 4;
+ static const int B_block = 16;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 4;
+ static const int out_height = 4;
+ static const int k_unroll = 16;
+
+ kern_type kernel = nullptr;
+
+ gemm_s8_4x4(const CPUInfo *ci) {
+ kernel = a64_gemm_s8_4x4;
+ }
+};
+
+#endif // __aarch64__
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp
new file mode 100644
index 000000000..0ec435b33
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+inline void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+ K /= 16; // k_unroll = 16: each unroll consumes sixteen int8 values per row.
+ int oddk = (K & 1);
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+
+ int k = ((K+1)/2)-1;
+
+ register int8x16_t b0 asm("v4");
+ register int8x16_t b1 asm("v5");
+ register int8x16_t b2 asm("v6");
+ register int8x16_t b3 asm("v7");
+ register int8x16_t b0a asm("v8");
+ register int8x16_t b1a asm("v9");
+ register int8x16_t b2a asm("v10");
+ register int8x16_t b3a asm("v11");
+
+ __asm __volatile (
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "movi v24.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v25.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v26.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v28.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v29.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v31.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+
+ // Loop structure optimized for A57 (after r0).
+
+ // Unavoidably, the multiply will "dribble" if
+ // dual issued with an add.
+
+ // Minimize the effect of this by making sure
+ // there are 2 adds to run under the dribbled
+ // multiply.
+
+ // Pipeline in blocks of 8 multiplies - combine
+ // this iteration's multiplies with adds from
+ // the previous iteration.
+
+ // So the first block doesn't have any adds to
+ // do - but because all the adds are at the
+ // start of the block it's only the first couple
+ // of multiplies that need to be pulled out.
+
+ // Start of unroll 0 (first iteration)
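+ // Illustrative steady-state shape of each block: four sadalp retire
+ // the previous block's v12-v15 partials into that row's 32-bit
+ // accumulators, while four smull plus four smlal2 compute this
+ // block's partials back into v12-v15.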
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Unroll 0 continuation (branch target)
+ "1:\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr], #0]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+ "ldr q0, [%[a_ptr], #128]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+
+ // Start of unroll 0 for next iteration.
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "bne 1b\n"
+
+ // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+
+ "3:\n"
+
+ // Final additions
+ "sadalp v28.4s, v12.8h\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "sadalp v31.4s, v15.8h\n"
+
+ // Horizontal reduction, phase 1
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
+
+ // Horizontal reduction, phase 2: a second pairwise add leaves the four row-3 results in v19
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
+
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
+ [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a),
+ [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
+ "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
+ }
+ }
+}
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp
new file mode 100644
index 000000000..7eb8b2dac
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Actual kernel implementations
+#include "a64_gemm_u16_12x8/generic.hpp"
+
+// 12x8 GEMM "strategy" class for uint16_t operands.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure passed to the constructor.
+class gemm_u16_12x8 {
+public:
+ typedef uint16_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel = nullptr;
+
+ gemm_u16_12x8(const CPUInfo *ci) {
+ kernel = a64_gemm_u16_asimd_12x8;
+ }
+};
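+
+// Illustrative use only (not part of this header): a caller that has
+// already packed the A and B panels in the layout described above might
+// drive the selected kernel as follows; "a_panel", "b_panel", "c_panel"
+// and "ci" are hypothetical names.
+//
+//   gemm_u16_12x8 strat(ci);
+//   strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);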
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp
new file mode 100644
index 000000000..b3f310ce6
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include <arm_neon.h>
+
+inline void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const uint16_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+
+ for (int yb = 0; yb < ablocks; yb++)
+ {
+ const uint16_t *a_ptr0 = a_ptr;
+ const uint16_t *b_ptr = Bpanel;
+
+ for (int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ const bool odd_k = K & 0x1;
+ int k = (K+1)/2 - 1;
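+ // Each iteration of the main loop below consumes two K steps (the
+ // "first" and "second" unrolls), hence the halving; odd_k selects the
+ // detached one-step tail for odd K.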
+
+ register uint16x8_t aa asm("v0");
+ register uint16x8_t ab asm("v1");
+ register uint16x8_t b0 asm("v2");
+ register uint16x8_t b1 asm("v3");
+ register uint16x8_t b2 asm("v4");
+
+ __asm __volatile (
+ "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+ "movi v5.4s, #0\n"
+ "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+ "movi v6.4s, #0\n"
+ "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
+ "movi v7.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v8.4s, #0\n"
+ "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+ "movi v9.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v10.4s, #0\n"
+ "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
+ "movi v11.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #96]")
+ "movi v12.4s, #0\n"
+ "movi v13.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #96]")
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #160]")
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #160]")
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v24.4s, #0\n"
+ "add %x[a_ptr], %x[a_ptr], #0x10\n"
+ "movi v25.4s, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v26.4s, #0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x18\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+
+ "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+ "1:\n" // Main loop
+ // First unroll
+ "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ // Second unroll
+ "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
+ "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+ "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "add %x[a_ptr], %x[a_ptr], #0x20\n"
+ "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "subs %x[k], %x[k], #0x1\n"
+ "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
+ "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "add %x[b_ptr], %x[b_ptr], #0x30\n"
+ "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "bne 1b\n"
+
+ "2:\n" // Even tail
+ "cbnz %x[odd_k], 3f\n"
+
+ "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "add %[a_ptr], %[a_ptr], #0x10\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "add %[b_ptr], %[b_ptr], #0x18\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+ "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "b 4f\n" // Complete write out
+
+ "3:\n" // Odd tail
+ "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+
+ "4:\n" // End of function
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "str q27, [%x[c_ptr], #0x140]\n"
+ "str q12, [%x[c_ptr], #0x150]\n"
+ "str q20, [%x[c_ptr], #0x160]\n"
+ "str q28, [%x[c_ptr], #0x170]\n"
+ "add %x[c_ptr], %x[c_ptr], #0x180\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
+ [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
+ : [odd_k] "r" (odd_k)
+ : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
+ );
+ }
+ }
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp
new file mode 100644
index 000000000..62cd747d7
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Load the actual kernel
+#include "a64_gemm_u8_12x8/generic.hpp"
+#include "a64_gemm_u8_12x8/a55r1.hpp"
+
+class gemm_u8_12x8 {
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 4;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 4;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 4;
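+
+ /* Note: blocking and unrolling K in groups of 4 matches the udot
+ instruction used by these kernels, which consumes 4 consecutive
+ uint8_t values of K per lane. */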
+
+ kern_type kernel = nullptr;
+
+ gemm_u8_12x8(const CPUInfo *ci) {
+ kernel = a64_gemm_u8_12x8;
+ if (ci->CPU == CPUTarget::A55_DOT) {
+ kernel = a64_gemm_u8_12x8_a55r1;
+ }
+ }
+};
+
+#endif // __aarch64__
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp
new file mode 100644
index 000000000..c7c2acbb4
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+#include "dot_toolchain_support.h"
+#include <cassert>
+
+inline void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ assert(Apanel);
+ assert(Bpanel);
+ assert(Cpanel);
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the udot instruction processes 4 elements at a time.
+ const int W = K/4;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W+1)/2) - 1;
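+ // Worked example: K=24 gives W=6, oddk=0 and init_value_k=2, so the main
+ // loop runs twice (4 double-steps) and the detached even tail supplies
+ // the last 2; K=28 gives W=7 and oddk=1, with the odd tail supplying the 7th.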
+ for (int yb=0; yb<ablocks; yb++) {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+ __asm __volatile (
+ _DECLARE_UDOT
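+ // _DECLARE_UDOT (see dot_toolchain_support.h) defines a temporary
+ // assembler macro named "udot" for toolchains without native support;
+ // it is removed again by the ".purgem udot" at the end of this block.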
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+
+ // Loop proper
+ "1:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "ins %[a0].d[1], x20\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Detached final iteration (even K)
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+ ".purgem udot\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ }
+ }
+}
+#endif
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
new file mode 100644
index 000000000..718232fb0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
+#define _DECLARE_UDOT ".altmacro\n"\
+ ".macro udot opd:req, opn:req, opm:req\n"\
+ "local vd, vn, vm, h, l\n"\
+ ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
+ ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
+ ".set vd,\\reg\n"\
+ ".endif\n"\
+ ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
+ ".set vn,\\reg\n"\
+ ".endif\n"\
+ ".irp idx,0,1,2,3\n"\
+ ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
+ ".set vm,\\reg\n"\
+ ".set h,\\idx / 2\n"\
+ ".set l,\\idx %% 2\n"\
+ ".endif\n"\
+ ".endr\n"\
+ ".endr\n"\
+ ".ifndef vd\n"\
+ ".error \"Bad operand \\opd\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef vn\n"\
+ ".error \"Bad operand \\opn\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef vm\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef h\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef l\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
+ ".endm\n"\
+
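+// Worked example of the encoding above: "udot v8.4s, v2.16b, v0.4b[0]"
+// selects vd=8, vn=2, vm=0, h=0, l=0, so the emitted word is
+// 0x6f80e000 | 8 | (2 << 5) = 0x6f80e048.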
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp
new file mode 100644
index 000000000..3531eb6d2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+#include "dot_toolchain_support.h"
+#include <cassert>
+
+inline void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ assert(Apanel);
+ assert(Bpanel);
+ assert(Cpanel);
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the udot instruction processes 4 elements at a time.
+ const int W = K/4;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W+1)/2) - 1;
+ for (int yb=0; yb<ablocks; yb++) {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register uint32x4_t a0 asm("v0");
+ register uint32x4_t a1 asm("v1");
+ register uint32x4_t b0 asm("v2");
+ register uint32x4_t b1 asm("v3");
+ register uint32x4_t b2 asm("v4");
+ register uint32x4_t a0a asm("v5");
+ register uint32x4_t a1a asm("v6");
+ __asm __volatile (
+ _DECLARE_UDOT
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+ ".purgem udot\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+
+ }
+ }
+}
+#endif
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp
new file mode 100644
index 000000000..3561bfec9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Load the actual kernel
+#include "a64_gemm_u8_4x4/generic.hpp"
+
+class gemm_u8_4x4 {
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 4;
+ static const int A_block = 16;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 4;
+ static const int B_block = 16;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 4;
+ static const int out_height = 4;
+ static const int k_unroll = 16;
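+
+ /* Note: K is blocked and unrolled in groups of 16 to match the
+ umull/umlal2 + uadalp sequence in the kernel, which consumes 16
+ uint8_t values of K per accumulator pass. */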
+
+ kern_type kernel = nullptr;
+
+ gemm_u8_4x4(const CPUInfo *ci) {
+ kernel = a64_gemm_u8_4x4;
+ }
+};
+
+#endif // __aarch64__
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp
new file mode 100644
index 000000000..e48c373f2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+inline void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ K /= 16;
+ int oddk = (K & 1);
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+
+ int k = ((K+1)/2)-1;
+
+ register uint8x16_t b0 asm("v4");
+ register uint8x16_t b1 asm("v5");
+ register uint8x16_t b2 asm("v6");
+ register uint8x16_t b3 asm("v7");
+ register uint8x16_t b0a asm("v8");
+ register uint8x16_t b1a asm("v9");
+ register uint8x16_t b2a asm("v10");
+ register uint8x16_t b3a asm("v11");
+
+ __asm __volatile (
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "movi v24.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v25.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v26.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v28.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v29.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v31.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+
+ // Loop structure optimized for Cortex-A57 (revisions after r0).
+
+ // Unavoidably, the multiply will "dribble" if
+ // dual issued with an add.
+
+ // Minimize the effect of this by making sure
+ // there are 2 adds to run under the dribbled
+ // multiply.
+
+ // Pipeline in blocks of 8 multiplies - combine
+ // this iteration's multiplies with adds from
+ // the previous iteration.
+
+ // So the first block doesn't have any adds to
+ // do - but because all the adds are at the
+ // start of the block it's only the first couple
+ // of multiplies that need to be pulled out.
+
+ // Start of unroll 0 (first iteration)
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Unroll 0 continuation (branch target)
+ "1:\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "umlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "umlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "umlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "umlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "umlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v2.16b, %[b0].16b\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "umlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "umlal2 v14.8h, v2.16b, %[b2].16b\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "umlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr], #0]\n"
+ "umlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "umlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "uadalp v28.4s, v12.8h\n"
+ "umull v12.8h, v0.8b, %[b0a].8b\n"
+ "uadalp v29.4s, v13.8h\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull v13.8h, v0.8b, %[b1a].8b\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull v14.8h, v0.8b, %[b2a].8b\n"
+ "umull v15.8h, v0.8b, %[b3a].8b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "umlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "umlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "umlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "umlal2 v15.8h, v0.16b, %[b3a].16b\n"
+ "ldr q0, [%[a_ptr], #128]\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0a].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v13.8h, v1.8b, %[b1a].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "umull v14.8h, v1.8b, %[b2a].8b\n"
+ "umull v15.8h, v1.8b, %[b3a].8b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "umlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "umlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "umlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "umlal2 v15.8h, v1.16b, %[b3a].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0a].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v13.8h, v2.8b, %[b1a].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v14.8h, v2.8b, %[b2a].8b\n"
+ "umull v15.8h, v2.8b, %[b3a].8b\n"
+ "umlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "umlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "umlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "umlal2 v15.8h, v2.16b, %[b3a].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0a].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v13.8h, v3.8b, %[b1a].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v14.8h, v3.8b, %[b2a].8b\n"
+ "umull v15.8h, v3.8b, %[b3a].8b\n"
+ "umlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "umlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "umlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "umlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+
+ // Start of unroll 0 for next iteration.
+ "uadalp v28.4s, v12.8h\n"
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "uadalp v29.4s, v13.8h\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+ "uadalp v31.4s, v15.8h\n"
+ "bne 1b\n"
+
+ // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "umlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "umlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "umlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "umlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "umlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "umlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "umlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "umlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "umlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "uadalp v28.4s, v12.8h\n"
+ "umull v12.8h, v0.8b, %[b0a].8b\n"
+ "uadalp v29.4s, v13.8h\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull v13.8h, v0.8b, %[b1a].8b\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull v14.8h, v0.8b, %[b2a].8b\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "umull v15.8h, v0.8b, %[b3a].8b\n"
+ "umlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "umlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "umlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "umlal2 v15.8h, v0.16b, %[b3a].16b\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0a].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v13.8h, v1.8b, %[b1a].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v14.8h, v1.8b, %[b2a].8b\n"
+ "umull v15.8h, v1.8b, %[b3a].8b\n"
+ "umlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "umlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "umlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "umlal2 v15.8h, v1.16b, %[b3a].16b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0a].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v13.8h, v2.8b, %[b1a].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "umull v14.8h, v2.8b, %[b2a].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "umull v15.8h, v2.8b, %[b3a].8b\n"
+ "umlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "umlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "umlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ "umlal2 v15.8h, v2.16b, %[b3a].16b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0a].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v13.8h, v3.8b, %[b1a].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "umull v14.8h, v3.8b, %[b2a].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "umull v15.8h, v3.8b, %[b3a].8b\n"
+ "umlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "umlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "umlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "umlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "umlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "umlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "umlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "umlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "umlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "umlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "umlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "umlal2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+ "umlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "umlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "umlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "umlal2 v15.8h, v3.16b, %[b3].16b\n"
+
+ "3:\n"
+
+ // Final additions
+ "uadalp v28.4s, v12.8h\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "uadalp v29.4s, v13.8h\n"
+ "uadalp v30.4s, v14.8h\n"
+ "uadalp v31.4s, v15.8h\n"
+
+ // Horizontal reduction, phase 1
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
+
+ // Horizontal reduction, phase 2
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
+
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
+ [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a),
+ [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
+ "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
+ }
+ }
+}
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp
new file mode 100644
index 000000000..ba6d2989c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+// Get the components we need to implement HGEMM.
+// Can select appropriate components dependent on AArch32 vs. AArch64 etc. at build time.
+#include "a64_hgemm_24x8/generic.hpp"
+
+// 24x8 HGEMM "strategy" class. Describes the kernel properties.
+//
+// The generic "gemm_opt" function will instantiate one of these (allowing
+// the constructor to pick a kernel implementation).
+class hgemm_24x8 {
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+ static const int A_block = 1;
+ static const int A_interleave = 8;
+ static const bool A_transpose = false;
+
+ static const int B_block = 1;
+ static const int B_interleave = 24;
+ static const bool B_transpose = true;
+
+ static const int out_width = 24;
+ static const int out_height = 8;
+ static const int k_unroll = 1;
+
+ kern_type kernel = nullptr;
+
+ hgemm_24x8(const struct CPUInfo *ci) {
+ kernel = a64_hgemm_asimd_24x8;
+ }
+};
+
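+// Illustrative use of the strategy class (a sketch, not part of this file's
+// API): a GEMM driver reads the blocking parameters and calls the selected
+// kernel on pre-packed panels, e.g.
+//
+//   hgemm_24x8 strat(ci);   // constructor picks the kernel implementation
+//   strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
+//
+// where a_panel/b_panel/c_panel are hypothetical pointers to panels packed
+// according to A_interleave/B_interleave above.
+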
+#endif // __aarch64__ and FP16_VECTOR_ARITHMETIC
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp
new file mode 100644
index 000000000..03e2bb95a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <arm_neon.h>
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
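+//
+// For example (illustrative): one C output block is out_width x out_height =
+// 24x8 __fp16 values, i.e. 24*8*2 = 384 bytes, which is why the store tail
+// below advances c_ptr by #384 per block.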
+
+inline void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+ const __fp16 *a_ptr = Apanel;
+ __fp16 *c_ptr = Cpanel;
+ for (int yb=0; yb<ablocks; yb++) {
+ const __fp16 *a_ptr0 = a_ptr;
+ const __fp16 *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
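+ // Worked example of the bookkeeping above (illustrative): for K=5,
+ // oddk=1 and k=((5+1)/2)-1=2, so the main loop covers four K steps two
+ // at a time and the detached odd-K tail handles the fifth; for K=2,
+ // oddk=0 and k=0, the main loop is skipped and only the even-K tail runs.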
+ register float16x8_t a0 asm("v0");
+ register float16x8_t a0a asm("v1");
+ register float16x8_t b0 asm("v2");
+ register float16x8_t b1 asm("v3");
+ register float16x8_t b2 asm("v4");
+ register float16x8_t b0a asm("v5");
+ register float16x8_t b1a asm("v6");
+ register float16x8_t b2a asm("v7");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.8h, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v11.8h, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v12.8h, #0x0\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+ "movi v13.8h, #0x0\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+ "movi v14.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v15.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v16.8h, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v17.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v18.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v19.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #288]")
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #352]")
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+
+ "bne 1b\n"
+ "4:\n"
+
+ // Jump to odd tail if necessary.
+ "cbnz %w[oddk], 2f\n"
+
+ // Even tail.
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+ "b 3f\n"
+
+ // Odd tail
+ "2:\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a0a] "+w" (a0a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k),
+ [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ }
+ }
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
index e229e215e..603ad8dc0 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
@@ -28,6 +28,9 @@
// Actual kernel implementations
#include "a64_sgemm_12x8/generic.hpp"
#include "a64_sgemm_12x8/a53.hpp"
+#include "a64_sgemm_12x8/a55.hpp"
+#include "a64_sgemm_12x8/a55r1.hpp"
+
// 12x8 SGEMM "strategy" class.
//
@@ -66,6 +69,12 @@ public:
if (ci->CPU == CPUTarget::A53) {
kernel = a64_sgemm_asimd_12x8_a53;
}
+ else if (ci->CPU == CPUTarget::A55) {
+ kernel = a64_sgemm_asimd_12x8_a55;
+ }
+ else if (ci->CPU == CPUTarget::A55_DOT) {
+ kernel = a64_sgemm_asimd_12x8_a55r1;
+ }
}
};
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
index e58ce6682..1c9b4b38f 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
@@ -206,7 +206,7 @@ inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, f
// Branch here if K=1 or 2. Do the right thing for odd/even at the end.
"4:\n"
- "cbnz %[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration. (even K)
"ldr %d[b2], [%[b_ptr], #32]\n"
@@ -360,8 +360,9 @@ inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, f
[b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
: [oddk] "r" (oddk)
: "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
);
}
}
}
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp
new file mode 100644
index 000000000..85d8a502f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+inline void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
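+ // Scheduling note (illustrative): in the loop below each 128-bit operand
+ // load is split into a 64-bit "ldr %d" plus a 64-bit GPR load and an
+ // "ins ... d[1], x20". The Cortex-A55 load path is 64 bits wide, so this
+ // keeps each load to a single slot per cycle and lets it dual-issue with
+ // the surrounding fmla work.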
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "subs %w[k], %w[k], #1\n"
+
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ins %[b1].d[1], x20\n"
+
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %w[oddk], 2f\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ // Detached final iteration. (even K)
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+ }
+ }
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp
new file mode 100644
index 000000000..295308053
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+inline void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+ for (int yb=0; yb<ablocks; yb++) {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ ASM_PREFETCH("[%[b_ptr], #256]")
+
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ ASM_PREFETCH("[%[a_ptr], #384]")
+
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ ASM_PREFETCH("[%[b_ptr], #512]")
+
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ ASM_PREFETCH("[%[a_ptr], #448]")
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #576]")
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ ASM_PREFETCH("[%[b_ptr], #640]")
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %w[oddk], 2f\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ // Detached final iteration. (even K)
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+ }
+ }
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
index 082c20064..c4a5875a3 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
@@ -181,7 +181,7 @@ inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel,
"4:\n"
// Branch to alternative tail for odd K
- "cbnz %[oddk], 2f\n"
+ "cbnz %w[oddk], 2f\n"
// Detached final iteration (even K)
"fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
@@ -347,7 +347,7 @@ inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel,
[b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
: [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
: "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
);
}
}
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
index f2c5fd86b..e8edddb4f 100644
--- a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
@@ -226,7 +226,7 @@ inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, co
[outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
[inptr] "+r" (inptr)
: [av] "w" (av), [bv] "w" (bv)
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", "q18", "q19", "q20", "q21"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
);
}
}
diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
index d2f8ba923..f7a1d1c70 100644
--- a/arm_compute/core/NEON/kernels/assembly/profiler.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
@@ -31,6 +31,7 @@ class profiler {
private:
static const int maxevents = 10000;
unsigned long times[maxevents];
+ unsigned long units[maxevents];
int events[maxevents];
int currentevent;
int countfd;
@@ -45,35 +46,38 @@ public:
close(countfd);
int tots[5];
unsigned long counts[5];
+ unsigned long tunits[5];
const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
for (int i=1; i<5; i++) {
tots[i] = 0;
counts[i] = 0;
+ tunits[i] = 0;
}
printf("Profiled events:\n");
for (int i=0; i<currentevent; i++) {
- printf("%10s: %ld\n", descs[events[i]-1], times[i]);
tots[events[i]]++;
counts[events[i]] += times[i];
+ tunits[events[i]] += units[i];
}
- printf("%20s %9s %9s %9s\n", "", "Events", "Total", "Average");
+ printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
for (int i=1; i<5; i++) {
- printf("%20s: %9d %9ld %9ld\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i]);
+ printf("%20s: %9d %9ld %9ld %12lu %9.2f\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i],tunits[i],(float)tunits[i]/counts[i]);
}
}
template <typename T>
- void operator() (int i, T func) {
+ void operator() (int i, unsigned long u, T func) {
if (currentevent==maxevents) {
func();
} else {
+ events[currentevent] = i;
+ units[currentevent] = u;
start_counter(countfd);
func();
long long cycs = stop_counter(countfd);
- events[currentevent] = i;
times[currentevent++] = cycs;
}
}
@@ -84,7 +88,7 @@ public:
class profiler {
public:
template <typename T>
- void operator() (int i, T func) {
+ void operator() (int i, unsigned long u, T func) {
func();
}
};
@@ -95,3 +99,5 @@ public:
#define PROFILE_PREPB 2
#define PROFILE_KERNEL 3
#define PROFILE_MERGE 4
+
+
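+// Example use of the updated interface (a sketch; "prof", "strat" and the
+// work estimate are hypothetical): the new second argument is a unit count
+// (e.g. bytes moved or MACs performed), which the report divides by the
+// measured cycles to print a per-cycle rate:
+//
+//   prof(PROFILE_KERNEL, (unsigned long)M * N * K, [&]() {
+//       strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
+//   });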
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp
new file mode 100644
index 000000000..0c23cebe6
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+#include "../asmlib.hpp"
+
+
+template<>
+template<typename T>
+void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+ uint16_t *outptr = (uint16_t *)out;
+ const uint16_t *inptr = (const uint16_t *)in;
+
+ uint16_t zerobuff[24] = {0}; // zero-initialise: the ragged-edge path below reads from this
+
+ for (int y=y0; y<ymax; y+=8) {
+ const uint16_t *inptr0 = inptr + y * ldin + k0;
+ const uint16_t *inptr1 = inptr0 + ldin;
+ const uint16_t *inptr2 = inptr1 + ldin;
+ const uint16_t *inptr3 = inptr2 + ldin;
+ const uint16_t *inptr4 = inptr3 + ldin;
+ const uint16_t *inptr5 = inptr4 + ldin;
+ const uint16_t *inptr6 = inptr5 + ldin;
+ const uint16_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x=(kmax-k0);
+ for (;x>7;x-=8) {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if ((y + 7) >= ymax) {
+ switch ((y + 7) - ymax) {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ default:
+ break;
+ }
+ }
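+ // Illustrative example of the fall-through above: if ymax - y == 3, then
+ // (y + 7) - ymax == 4, so cases 4 down to 0 all execute and inptr3..inptr7
+ // are redirected to zerobuff, leaving the three valid rows inptr0..inptr2
+ // untouched.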
+
+ int skippf = (x & 31);
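+ // Illustrative note: skippf is zero only when x is a multiple of 32, i.e.
+ // on every fourth pass of this loop (each pass consumes 16 bytes per source
+ // row), so the CBNZ guards below issue the prefetches roughly once per
+ // 64-byte cache line rather than on every pass (line size is an assumption).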
+ __asm __volatile (
+ // Load up 8 elements (1 vector) from each of 8 sources.
+ "CBNZ %w[skippf], 1f\n"
+ ASM_PREFETCH("[%[inptr0], #128]")
+ ASM_PREFETCH("[%[inptr1], #128]")
+ ASM_PREFETCH("[%[inptr2], #128]")
+ ASM_PREFETCH("[%[inptr3], #128]")
+ "1:\n"
+
+ "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
+ "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
+ "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
+ "LDR q6, [%[inptr6]], #16\n"
+ "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
+ "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
+ "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
+ "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
+ "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
+ "LDR q5, [%[inptr5]], #16\n"
+ "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
+ "LDR q7, [%[inptr7]], #16\n"
+ "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
+ "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
+ "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
+ "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
+
+ "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1
+ "ZIP2 v20.8h, v8.8h, v9.8h\n"
+ "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
+ "ZIP2 v21.8h, v10.8h, v11.8h\n"
+
+ "CBNZ %w[skippf], 2f\n"
+ ASM_PREFETCH("[%[inptr4], #112]")
+ ASM_PREFETCH("[%[inptr5], #112]")
+ ASM_PREFETCH("[%[inptr6], #112]")
+ ASM_PREFETCH("[%[inptr7], #112]")
+ "2:\n"
+
+ "ZIP1 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v30.8h, v16.8h, v17.8h\n"
+ "ZIP1 v23.8h, v18.8h, v19.8h\n"
+ "ZIP2 v31.8h, v18.8h, v19.8h\n"
+
+ "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0
+ "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1
+ "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements
+
+ "ZIP1 v0.8h, v20.8h, v21.8h\n"
+ "ZIP2 v1.8h, v20.8h, v21.8h\n"
+ "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements
+
+ "ZIP1 v2.8h, v22.8h, v23.8h\n"
+ "ZIP2 v3.8h, v22.8h, v23.8h\n"
+ "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements
+
+ "ZIP1 v4.8h, v30.8h, v31.8h\n"
+ "ZIP2 v5.8h, v30.8h, v31.8h\n"
+ "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements
+ : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+ [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+ : [skippf] "r" (skippf)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+ "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ );
+ }
+
+ for (;x>0;x--) {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp
new file mode 100644
index 000000000..e440e3288
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 12x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 4, 4>::Transform(
+ T* out, const T* const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax
+) {
+ // Redirect to a 24 x uint16_t specialisation
+ TransformImpl<24, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t * const>(in),
+ stride*2, x0*2, xmax*2, k0, kmax
+ );
+}
+
+// Generic 24x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+ T* out, const T* const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax
+) {
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t * const>(in),
+ stride, x0, xmax, k0, kmax
+ );
+}
+
+// Specialised 24 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+ __asm __volatile (
+ "LDP q0, q1, [%[in0]], #32\n"
+ "STP q0, q1, [%[out]]\n"
+ ASM_PREFETCH("[%[in0], #192]")
+ "LDR q2, [%[in0]], #16\n"
+ "STR q2, [%[out], #32]\n"
+ : [in0] "+r" (in0), [out] "+r" (out)
+ :
+ : "v0", "v1", "v2", "memory"
+ );
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
+ __asm __volatile (
+ "LDP q0, q1, [%[in0]], #32\n"
+ "STP q0, q1, [%[out]]\n"
+ ASM_PREFETCH("[%[in0], #192]")
+ "LDR q2, [%[in0]], #16\n"
+ "LDP q3, q4, [%[in1]], #32\n"
+ "STP q2, q3, [%[out], #32]\n"
+ ASM_PREFETCH("[%[in1], #192]")
+ "LDR q5, [%[in1]], #16\n"
+ "STP q4, q5, [%[out], #64]\n"
+ : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
+ );
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+ __asm __volatile (
+ "LDP q0, q1, [%[in0]], #32\n"
+ "STP q0, q1, [%[out]]\n"
+ "LDR q2, [%[in0]], #16\n"
+ ASM_PREFETCH("[%[in0], #192]")
+ "LDP q3, q4, [%[in1]], #32\n"
+ "STP q2, q3, [%[out], #32]\n"
+ "LDR q5, [%[in1]], #16\n"
+ ASM_PREFETCH("[%[in1], #192]")
+ "STP q4, q5, [%[out], #64]\n"
+ "LDP q6, q7, [%[in2]], #32\n"
+ "STP q6, q7, [%[out], #96]\n"
+ "LDR q8, [%[in2]], #16\n"
+ ASM_PREFETCH("[%[in2], #192]")
+ "LDP q9, q10, [%[in3]], #32\n"
+ "STP q8, q9, [%[out], #128]\n"
+ "LDR q11, [%[in3]], #16\n"
+ "STP q10, q11, [%[out], #160]\n"
+ ASM_PREFETCH("[%[in3], #192]")
+
+ : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
+ );
+}
+
+template <>
+template <>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+ uint16_t* out, const uint16_t* const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax
+) {
+ TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif  // __aarch64__
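The 12 x 32-bit case above is a pure re-labelling: each 32-bit element is two adjacent 16-bit halves that always travel together, so doubling the column coordinates and the element stride and calling the 24 x 16-bit kernel produces a byte-identical result. A simplified scalar reference for the 24-wide transform itself (a sketch that assumes the x-range is a multiple of 24 and ignores the edge handling the real kernel performs):

#include <cstdint>

// For each block of 24 consecutive columns, rows k0..kmax-1 are copied out
// one after another, 24 elements at a time.
static void transpose_interleave_24_ref(uint16_t *out, const uint16_t *in,
                                        int stride, int x0, int xmax,
                                        int k0, int kmax)
{
    for (int x = x0; x < xmax; x += 24)  // column blocks
    {
        for (int k = k0; k < kmax; k++)  // source rows
        {
            for (int c = 0; c < 24; c++) // one 24-wide slice
            {
                *out++ = in[k * stride + x + c];
            }
        }
    }
}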
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
index 13e1b5468..8a2213f7f 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
@@ -23,10 +23,10 @@
*/
#include "a32_interleave_6way_32bit.hpp"
#include "a32_transpose_interleave_8way_32bit.hpp"
-//#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_16bit.hpp"
#include "a64_interleave_8way_32bit.hpp"
//#include "a64_interleave_8way_half_to_float.hpp"
//#include "a64_transpose_interleave_12way_16bit.hpp"
//#include "a64_transpose_interleave_12way_half_to_float.hpp"
-//#include "a64_transpose_interleave_24way_16bit.hpp"
+#include "a64_transpose_interleave_24way_16bit.hpp"
#include "transpose_interleave_common.hpp"
diff --git a/arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h
new file mode 100644
index 000000000..7f39e5ee8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__
+#define __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace detail
+{
+inline float32x4x3_t load_matrix_row(const float *ptr)
+{
+ const float32x4x3_t r =
+ {
+ {
+ vld1q_dup_f32(ptr),
+ vld1q_dup_f32(1 + ptr),
+ vld1q_dup_f32(2 + ptr)
+ }
+ };
+ return r;
+}
+
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+
+template <>
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float32x4x3_t vtop =
+ {
+ {
+ vld1q_f32(in_top),
+ vld1q_f32(in_top + 4),
+ vld1q_f32(in_top + 8)
+ }
+ };
+ const float32x4x3_t vmid =
+ {
+ {
+ vld1q_f32(in_mid),
+ vld1q_f32(in_mid + 4),
+ vld1q_f32(in_mid + 8)
+ }
+ };
+ const float32x4x3_t vlow =
+ {
+ {
+ vld1q_f32(in_low),
+ vld1q_f32(in_low + 4),
+ vld1q_f32(in_low + 8)
+ }
+ };
+ float32x4x2_t out =
+ {
+ {
+ vmulq_f32(vtop.val[0], m0.val[0]),
+ vmulq_f32(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+ vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+template <unsigned int stridex>
+int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
+
+template <>
+int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration;
+}
+
+template <>
+int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration << 1;
+}
+
+template <>
+int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration * 3;
+}
+}
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__ */ \ No newline at end of file
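These helpers are meant to compose into an output-row loop: the three filter rows are broadcast once with load_matrix_row(), then three input-row pointers slide across the image while convolve_3x3<stridex>() produces eight stride-1 outputs per step. A minimal sketch for stride 1 (illustrative only: no border handling, and each row pointer must stay readable for the 12 floats loaded at the final iteration):

#include <arm_neon.h>
#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h"

// One output row of a 3x3/stride-1 convolution: res.val[0] carries outputs
// 0-3 and res.val[1] outputs 4-7 of each iteration.
static void convolve_row_stride1(const float *top, const float *mid, const float *low,
                                 const float *weights, float *out, int out_width)
{
    using namespace arm_compute::detail;
    const float32x4x3_t m0 = load_matrix_row(weights);     // filter row 0, broadcast
    const float32x4x3_t m1 = load_matrix_row(weights + 3); // filter row 1
    const float32x4x3_t m2 = load_matrix_row(weights + 6); // filter row 2

    for (int x = 0; x + 8 <= out_width; x += 8)
    {
        const float32x4x2_t res = convolve_3x3<1>(top + x, mid + x, low + x, m0, m1, m2, 0);
        store_results<1>(out + x, res);
    }
}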
diff --git a/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h
new file mode 100644
index 000000000..c35855861
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H__
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace detail
+{
+/** Loads a 3x3 matrix as a row (float).
+ *
+ * @param[in] ptr Pointer to a float 3x3 matrix.
+ *
+ * @return The loaded matrix.
+ */
+inline float32x4x3_t load_matrix_row(const float *ptr)
+{
+ const float32x4x3_t r =
+ {
+ {
+ vld1q_dup_f32(ptr),
+ vld1q_dup_f32(1 + ptr),
+ vld1q_dup_f32(2 + ptr)
+ }
+ };
+ return r;
+}
+
+/** Loads a 3x3 matrix as a row (qint8_t).
+ *
+ * @param[in] ptr Pointer to a qint8 3x3 matrix.
+ *
+ * @return The loaded matrix.
+ */
+inline qint8x8x3_t load_matrix_row(const qint8_t *ptr)
+{
+ /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+ r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+ const qint8x8x3_t r =
+ {
+ {
+ vld1_dup_qs8(ptr),
+ vld1_dup_qs8(1 + ptr),
+ vld1_dup_qs8(2 + ptr)
+ }
+ };
+ return r;
+}
+
+/** Perform a convolve3x3 on float32.
+ *
+ * @param[in] in_top Pointer to the first row of the input.
+ * @param[in] in_mid Pointer to the second row of the input.
+ * @param[in] in_low Pointer to the third row of the input.
+ * @param[in] m0 First row of the filter.
+ * @param[in] m1 Second row of the filter.
+ * @param[in] m2 Third row of the filter.
+ * @param[in] fixed_point_position (Optional) Fixed point position.
+ *
+ */
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+
+template <>
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float32x4x3_t vtop =
+ {
+ {
+ vld1q_f32(in_top),
+ vld1q_f32(in_top + 4),
+ vld1q_f32(in_top + 8)
+ }
+ };
+ const float32x4x3_t vmid =
+ {
+ {
+ vld1q_f32(in_mid),
+ vld1q_f32(in_mid + 4),
+ vld1q_f32(in_mid + 8)
+ }
+ };
+ const float32x4x3_t vlow =
+ {
+ {
+ vld1q_f32(in_low),
+ vld1q_f32(in_low + 4),
+ vld1q_f32(in_low + 8)
+ }
+ };
+ float32x4x2_t out =
+ {
+ {
+ vmulq_f32(vtop.val[0], m0.val[0]),
+ vmulq_f32(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+
+ out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+
+ out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+/** Perform a convolve3x3 on qint16.
+ *
+ * @param[in] in_top Pointer to the first row of the input.
+ * @param[in] in_mid Pointer to the second row of the input.
+ * @param[in] in_low Pointer to the third row of the input.
+ * @param[in] m0 First row of the filter.
+ * @param[in] m1 Second row of the filter.
+ * @param[in] m2 Third row of the filter.
+ * @param[in] fixed_point_position (Optional) Fixed point position.
+ *
+ */
+template <unsigned int stridex>
+qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position);
+
+template <>
+inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const qint8x8x3_t vtop =
+ {
+ {
+ vld1_qs8(in_top),
+ vld1_qs8(in_top + 8),
+ vld1_qs8(in_top + 16)
+ }
+ };
+ const qint8x8x3_t vmid =
+ {
+ {
+ vld1_qs8(in_mid),
+ vld1_qs8(in_mid + 8),
+ vld1_qs8(in_mid + 16)
+ }
+ };
+ const qint8x8x3_t vlow =
+ {
+ {
+ vld1_qs8(in_low),
+ vld1_qs8(in_low + 8),
+ vld1_qs8(in_low + 16)
+ }
+ };
+ qint16x8x2_t out =
+ {
+ {
+ vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
+ vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
+ }
+ };
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
+ return out;
+}
+
+/** Stores a float32x4x2_t array into a memory location.
+ *
+ * @param[in] buffer Pointer to the memory location where the values will be stored.
+ * @param[in] values Values that will be stored.
+ *
+ */
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+inline void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+ vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+inline void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+inline void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+/** Stores a qint16_t array into a memory location.
+ *
+ * @param[in] buffer Pointer to the memory location where the values will be stored.
+ * @param[in] values Values that will be stored.
+ *
+ */
+template <unsigned int stridex>
+void store_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+inline void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+ vst1q_qs16(buffer + 8, values.val[1]);
+}
+
+template <>
+inline void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+}
+
+template <>
+inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1_qs16(buffer, vget_low_s16(values.val[0]));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+/** Loads a 3x3 matrix as a row (float16_t).
+ *
+ * @param[in] ptr Pointer to a float 3x3 matrix.
+ *
+ * @return The loaded matrix.
+ */
+inline float16x8x3_t load_matrix_row(const float16_t *ptr)
+{
+ /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+ r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+ const float16x8x3_t r =
+ {
+ {
+ vld1q_dup_f16(ptr),
+ vld1q_dup_f16(1 + ptr),
+ vld1q_dup_f16(2 + ptr)
+ }
+ };
+ return r;
+}
+
+/** Perform a convolve3x3 on float16.
+ *
+ * @param[in] in_top Pointer to the first row of the input.
+ * @param[in] in_mid Pointer to the second row of the input.
+ * @param[in] in_low Pointer to the third row of the input.
+ * @param[in] m0 First row of the filter.
+ * @param[in] m1 Second row of the filter.
+ * @param[in] m2 Third row of the filter.
+ * @param[in] fixed_point_position (Optional) Fixed point position.
+ *
+ */
+template <unsigned int stridex>
+float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position);
+
+template <>
+inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float16x8x3_t vtop =
+ {
+ {
+ vld1q_f16(in_top),
+ vld1q_f16(in_top + 8),
+ vld1q_f16(in_top + 16)
+ }
+ };
+ const float16x8x3_t vmid =
+ {
+ {
+ vld1q_f16(in_mid),
+ vld1q_f16(in_mid + 8),
+ vld1q_f16(in_mid + 16)
+ }
+ };
+ const float16x8x3_t vlow =
+ {
+ {
+ vld1q_f16(in_low),
+ vld1q_f16(in_low + 8),
+ vld1q_f16(in_low + 16)
+ }
+ };
+ float16x8x2_t out =
+ {
+ {
+ vmulq_f16(vtop.val[0], m0.val[0]),
+ vmulq_f16(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+/** Stores a float16x8x2_t array into a memory location.
+ *
+ * @param[in] buffer Pointer to the memory location where the values will be stored.
+ * @param[in] values Values that will be stored.
+ *
+ */
+template <unsigned int stridex>
+void store_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+ vst1q_f16(buffer + 8, values.val[1]);
+}
+
+template <>
+inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+}
+
+template <>
+inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1_f16(buffer, vget_low_f16(values.val[0]));
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+/** Get the number of elements processed on 3x3 convolution.
+ *
+ * @param[in] num_elems_written_per_iteration Number of elements written per iteration on 3x3 convolution.
+ *
+ * @return The number of elements processed.
+ */
+template <unsigned int stridex>
+int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
+
+template <>
+inline int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration;
+}
+
+template <>
+inline int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration << 1;
+}
+
+template <>
+inline int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration * 3;
+}
+inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
+{
+ switch(stridex)
+ {
+ case 1:
+ return get_input_num_elems_processed<1>(num_elems_written_per_iteration);
+ case 2:
+ return get_input_num_elems_processed<2>(num_elems_written_per_iteration);
+ case 3:
+ return get_input_num_elems_processed<3>(num_elems_written_per_iteration);
+ default:
+ ARM_COMPUTE_ERROR("stridex not supported");
+ return 0;
+ }
+}
+}
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H__ */
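Note how the stride-2 and stride-3 specialisations above avoid writing separate kernels: they run the stride-1 computation and then compact the result, keeping only the lanes that land on valid output positions (every 2nd or 3rd stride-1 output). Scalar equivalent of the float stride-2 compaction (illustrative):

// Given eight stride-1 outputs r[0..7], stride 2 keeps lanes 0, 2, 4 and 6.
static void compact_stride2_ref(const float r[8], float out[4])
{
    for (int i = 0; i < 4; i++)
    {
        out[i] = r[2 * i];
    }
}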
diff --git a/arm_compute/core/NEON/kernels/winograd/alloc.hpp b/arm_compute/core/NEON/kernels/winograd/alloc.hpp
new file mode 100644
index 000000000..ef6f2b511
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/winograd/alloc.hpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef ALLOC_ALIGN
+#define ALLOCATE(x) aligned_alloc(ALLOC_ALIGN, x)
+#else
+#define ALLOCATE(x) malloc(x)
+#endif
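One caveat worth keeping in mind (an observation about ISO semantics, not something this header enforces): C11/C++17 aligned_alloc requires the requested size to be a multiple of the alignment, so when ALLOC_ALIGN is defined a defensive caller may want to round the size up first. A sketch:

#include <cstddef>
#include <cstdlib>

static inline void *allocate_rounded(std::size_t size)
{
#ifdef ALLOC_ALIGN
    // Round up to the next multiple of the alignment before allocating.
    size = (size + ALLOC_ALIGN - 1) / ALLOC_ALIGN * ALLOC_ALIGN;
#endif
    return ALLOCATE(size);
}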
diff --git a/arm_compute/core/NEON/kernels/winograd/tensor.hpp b/arm_compute/core/NEON/kernels/winograd/tensor.hpp
new file mode 100644
index 000000000..70ef65d2a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/winograd/tensor.hpp
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include "alloc.hpp"
+
+/*****************************************************************************/
+/* Padding definitions */
+enum PaddingType {
+ PADDING_SAME, PADDING_VALID
+};
+
+/*****************************************************************************/
+/* Shape of a kernel */
+struct KernelShape {
+ int n_output_channels, n_rows, n_cols, n_input_channels;
+
+ int size(void) const {
+ return n_output_channels * n_rows * n_cols * n_input_channels;
+ }
+};
+
+struct Tensor4DShape {
+ int n_batches,
+ n_rows,
+ n_cols,
+ n_channels;
+
+ int size() const {
+ return n_batches * n_rows * n_cols * n_channels;
+ }
+
+ bool TestEq(const Tensor4DShape& other) const {
+ return (n_batches == other.n_batches &&
+ n_rows == other.n_rows &&
+ n_cols == other.n_cols &&
+ n_channels == other.n_channels);
+ }
+};
+
+template <typename ShapeT, typename T>
+class Tensor4D final {
+ public:
+ Tensor4D(ShapeT shape) :
+ _shape(shape),
+ _data(reinterpret_cast<T*>(ALLOCATE(size_bytes()))) {
+ Clear();
+ }
+
+ ~Tensor4D() {
+ free(_data);
+ }
+
+ T* ptr() const {
+ return _data;
+ }
+
+ const ShapeT& shape() const {
+ return _shape;
+ }
+
+ size_t size_bytes() const {
+ return _shape.size() * sizeof(T);
+ }
+
+ bool TestEq(Tensor4D<ShapeT, T>& other) const;
+ T& element(int, int, int, int) const;
+ void Print() const;
+
+ void Clear() {
+ Fill(static_cast<T>(0));
+ }
+
+ void Fill(T val) {
+ for (int i = 0; i < _shape.size(); i++)
+ _data[i] = val;
+ }
+
+ void TestPattern() {
+ for (int i = 0; i < _shape.size(); i++)
+ _data[i] = static_cast<T>(i);
+ }
+
+ void Rand(const int seed=2311) {
+ std::mt19937 gen(seed);
+ std::uniform_int_distribution<> dis(-50, +50);
+
+ for (int i = 0; i < _shape.size(); i++) {
+ _data[i] = static_cast<T>(dis(gen));
+ }
+ }
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  Tensor4D(const Tensor4D &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+ Tensor4D &operator=(const Tensor4D &) = delete;
+ /** Allow instances of this class to be moved */
+ Tensor4D(Tensor4D &&) = default;
+ /** Allow instances of this class to be moved */
+ Tensor4D &operator=(Tensor4D &&) = default;
+
+
+ private:
+ const ShapeT _shape;
+ T* const _data;
+};
+
+
+template <>
+inline float& Tensor4D<Tensor4DShape, float>::element(int n, int i, int j, int c) const {
+ int index = ((n*_shape.n_rows + i)*_shape.n_cols + j)*_shape.n_channels + c;
+ return _data[index];
+}
+
+
+template <>
+inline float& Tensor4D<KernelShape, float>::element(int oc, int i, int j, int ic) const {
+ int index = ((i*_shape.n_cols + j)*_shape.n_input_channels + ic)*_shape.n_output_channels + oc;
+ return _data[index];
+}
+
+template <>
+inline bool Tensor4D<Tensor4DShape, float>::TestEq(Tensor4D<Tensor4DShape, float>& other) const {
+ // Test equivalence, printing errors
+ // First test the shapes are the same
+ if (!_shape.TestEq(other.shape())) {
+ printf("Tensors have different shapes.\n");
+ return false;
+ } else {
+ int incorrects = 0;
+
+ for (int n = 0; n < _shape.n_batches; n++) {
+ for (int i = 0; i < _shape.n_rows; i++) {
+ for (int j = 0; j < _shape.n_cols; j++) {
+ for (int c = 0; c < _shape.n_channels; c++) {
+ // Check elements for equivalence
+ const auto a = this->element(n, i, j, c);
+ const auto b = other.element(n, i, j, c);
+
+ if (a != b) {
+ printf("Difference at element {%d, %d, %d, %d}: %.3f != %.3f\n", n, i, j, c, a, b);
+
+ if (++incorrects > 100) {
+ printf("More than 100 incorrect values, stopping test.\n");
+ return false;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return incorrects == 0;
+ }
+}
+
+
+template <>
+inline void Tensor4D<Tensor4DShape, float>::Print() const {
+ for (int n = 0; n < _shape.n_batches; n++) {
+ for (int c = 0; c < _shape.n_channels; c++) {
+ for (int i = 0; i < _shape.n_rows; i++) {
+ for (int j = 0; j < _shape.n_cols; j++) {
+ printf("%5.2f ", element(n, i, j, c));
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+ }
+}
+
+
+template <>
+inline void Tensor4D<KernelShape, float>::Print() const {
+ for (int oc = 0; oc < _shape.n_output_channels; oc++) {
+ for (int ic = 0; ic < _shape.n_input_channels; ic++) {
+ for (int i = 0; i < _shape.n_rows; i++) {
+ for (int j = 0; j < _shape.n_cols; j++) {
+ printf("%5.2f ", element(oc, i, j, ic));
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+ }
+}
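A short usage sketch for the container above (shape and values are illustrative). Rand() is deterministic for a fixed seed, and element() uses NHWC indexing for Tensor4DShape:

#include "tensor.hpp"

int main()
{
    Tensor4D<Tensor4DShape, float> a({1, 4, 4, 2}); // 1 batch, 4x4 spatial, 2 channels
    Tensor4D<Tensor4DShape, float> b({1, 4, 4, 2});

    a.Rand();                        // default seed 2311 ...
    b.Rand();                        // ... so both tensors hold identical data
    a.element(0, 1, 2, 0) = 1000.0f; // outside the [-50, 50] random range

    return a.TestEq(b) ? 1 : 0;      // prints the single differing element
}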
diff --git a/arm_compute/core/QAsymm8.h b/arm_compute/core/QAsymm8.h
new file mode 100644
index 000000000..2fa402980
--- /dev/null
+++ b/arm_compute/core/QAsymm8.h
@@ -0,0 +1,33 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_QASYMM8_H__
+#define __ARM_COMPUTE_QASYMM8_H__
+
+#include "arm_compute/core/Rounding.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */
+}
+#include "arm_compute/core/QAsymm8.inl"
+#endif /* __ARM_COMPUTE_QASYMM8_H__ */
diff --git a/arm_compute/core/QAsymm8.inl b/arm_compute/core/QAsymm8.inl
new file mode 100644
index 000000000..611d68eb2
--- /dev/null
+++ b/arm_compute/core/QAsymm8.inl
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cmath>
+#include <limits>
+
+namespace arm_compute
+{
+inline qasymm8_t sqcvt_qasymm8_f32(float value, float scale, int offset, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+{
+ int quantized = arm_compute::round(value / scale, rounding_policy) + offset;
+ quantized = std::max(0, std::min(quantized, 255));
+ return quantized;
+}
+
+inline float scvt_f32_qasymm8(qasymm8_t value, float scale, int offset)
+{
+ float dequantized = (static_cast<int>(value) - offset) * scale;
+ return dequantized;
+}
+}
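A worked example of the two conversions (values chosen for illustration; round() itself is defined in the library's core sources, so linking against the library is assumed): quantisation computes round(value / scale) + offset clamped to [0, 255], and dequantisation inverts it with (q - offset) * scale.

#include "arm_compute/core/QAsymm8.h"
#include <cassert>

int main()
{
    using namespace arm_compute;
    // scale = 0.1, offset = 128: representable range is [-12.8, +12.7].
    const qasymm8_t q = sqcvt_qasymm8_f32(1.0f, 0.1f, 128); // round(1.0 / 0.1) + 128 = 138
    assert(q == 138);

    const float f = scvt_f32_qasymm8(q, 0.1f, 128);         // (138 - 128) * 0.1 = 1.0
    assert(f == 1.0f);

    // Out-of-range inputs saturate to the ends of [0, 255]:
    assert(sqcvt_qasymm8_f32(100.0f, 0.1f, 128) == 255);
    return 0;
}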
diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h
new file mode 100644
index 000000000..f95058c56
--- /dev/null
+++ b/arm_compute/core/Rounding.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ROUNDING_H__
+#define __ARM_COMPUTE_ROUNDING_H__
+
+namespace arm_compute
+{
+/** Rounding method */
+enum class RoundingPolicy
+{
+    TO_ZERO,         /**< Truncates the least significant values that are lost in operations. */
+ TO_NEAREST_UP, /**< Rounds to nearest value; half rounds away from zero */
+ TO_NEAREST_EVEN, /**< Rounds to nearest value; half rounds to nearest even */
+};
+
+/** Return a rounded value of x. Rounding is done according to the rounding_policy.
+ *
+ * @param[in] x Float value to be rounded.
+ * @param[in] rounding_policy Policy determining how rounding is done.
+ *
+ * @return Rounded value of the argument x.
+ */
+int round(float x, RoundingPolicy rounding_policy);
+}
+#endif /*__ARM_COMPUTE_ROUNDING_H__ */
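The library's round() is implemented in the core sources; a scalar reference matching the documented policies (a sketch, not the actual implementation) makes the differences concrete:

#include <cmath>
#include "arm_compute/core/Rounding.h"

static int round_ref(float x, arm_compute::RoundingPolicy policy)
{
    using arm_compute::RoundingPolicy;
    switch (policy)
    {
        case RoundingPolicy::TO_ZERO:
            return static_cast<int>(x);                 // truncate toward zero: 2.5 -> 2
        case RoundingPolicy::TO_NEAREST_UP:
            return static_cast<int>(std::round(x));     // half away from zero: 2.5 -> 3, -2.5 -> -3
        case RoundingPolicy::TO_NEAREST_EVEN:
            return static_cast<int>(std::nearbyint(x)); // ties to even (default FP mode): 2.5 -> 2, 3.5 -> 4
        default:
            return 0;
    }
}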
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
index 329fafb5f..105fdfde4 100644
--- a/arm_compute/core/Strides.h
+++ b/arm_compute/core/Strides.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/Dimensions.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
#include <algorithm>
#include <array>
@@ -58,5 +57,5 @@ public:
/** Default destructor */
~Strides() = default;
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_STRIDES_H__*/
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 54fb66a57..7c464c0b1 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -34,6 +34,7 @@
#include "arm_compute/core/Validate.h"
#include <cstddef>
+#include <memory>
namespace arm_compute
{
@@ -50,7 +51,7 @@ public:
* X and Y dimensions must match the parent's ones.
* @param[in] coords Coordinates of starting element inside parent tensor.
*/
- SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+ SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords);
/** Default destructor */
~SubTensorInfo() = default;
/** Allow instances of this class to be copy constructed */
@@ -61,29 +62,54 @@ public:
SubTensorInfo(SubTensorInfo &&) = default;
/** Allow instances of this class to be moved */
SubTensorInfo &operator=(SubTensorInfo &&) = default;
+ /** Returns the coordinates of the sub-tensor inside the parent tensor
+ *
+ * @return Sub-tensor coordinates
+ */
+ Coordinates coords() const
+ {
+ return _coords;
+ }
// Inherited methods overridden:
- void set_data_type(DataType data_type) override
+ std::unique_ptr<ITensorInfo> clone() const override;
+ ITensorInfo &set_data_type(DataType data_type) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_data_type(data_type);
+ return *this;
};
- void set_num_channels(int num_channels) override
+ ITensorInfo &set_num_channels(int num_channels) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_num_channels(num_channels);
+ return *this;
};
- void set_format(Format format) override
+ ITensorInfo &set_format(Format format) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_format(format);
+ return *this;
};
- void set_fixed_point_position(int fixed_point_position) override
+ ITensorInfo &set_fixed_point_position(int fixed_point_position) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_fixed_point_position(fixed_point_position);
+ return *this;
};
- void set_tensor_shape(TensorShape shape) override;
+ ITensorInfo &set_tensor_shape(TensorShape shape) override;
+ ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_quantization_info(quantization_info);
+ return *this;
+ }
+ ITensorInfo &reset_padding() override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->reset_padding();
+ return *this;
+ }
bool auto_padding() override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
@@ -159,10 +185,11 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->is_resizable();
}
- void set_is_resizable(bool is_resizable) override
+ ITensorInfo &set_is_resizable(bool is_resizable) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_is_resizable(is_resizable);
+ return *this;
}
ValidRegion valid_region() const override
{
@@ -171,9 +198,18 @@ public:
void set_valid_region(ValidRegion valid_region) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
+ // Check if subtensor is valid if parent is configured
+ if(_parent->tensor_shape().total_size() != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
+ }
_valid_region = std::move(valid_region);
}
+ QuantizationInfo quantization_info() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->quantization_info();
+ }
private:
ITensorInfo *_parent;
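The setters now return ITensorInfo& instead of void, so configuration calls can be chained; for a sub-tensor each call is still forwarded to the parent. A sketch (data type and quantisation values are illustrative):

#include "arm_compute/core/ITensorInfo.h"

void configure_quantized(arm_compute::ITensorInfo &info)
{
    using namespace arm_compute;
    info.set_data_type(DataType::QASYMM8)
        .set_quantization_info(QuantizationInfo(0.1f, 128))
        .set_is_resizable(false);
}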
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 35b9ccb9f..80ef7f8d5 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
+#include "ITensorInfo.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorShape.h"
@@ -33,6 +34,7 @@
#include "arm_compute/core/Utils.h"
#include <cstddef>
+#include <memory>
namespace arm_compute
{
@@ -97,6 +99,16 @@ public:
* @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
*/
TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+
+ /** Constructor
+ *
+     * @param[in] tensor_shape      Size for each dimension of the tensor in number of elements.
+     * @param[in] num_channels      Number of channels for each tensor element.
+ * @param[in] data_type Data type to use for each tensor element
+ * @param[in] quantization_info The quantization settings for the tensor data.
+ */
+ TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info);
+
/** Constructor
*
* @param[in] hog_info HOG's metadata used to allocate normalized HOG space
@@ -147,6 +159,7 @@ public:
* @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
*/
void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+
/** Initialize the metadata structure with the given parameters
*
* @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
@@ -200,12 +213,15 @@ public:
size_t init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height);
// Inherited methods overridden:
- void set_data_type(DataType data_type) override;
- void set_num_channels(int num_channels) override;
- void set_format(Format format) override;
- void set_tensor_shape(TensorShape shape) override;
- void set_fixed_point_position(int fixed_point_position) override;
- bool auto_padding() override;
+ std::unique_ptr<ITensorInfo> clone() const override;
+ ITensorInfo &set_data_type(DataType data_type) override;
+ ITensorInfo &set_num_channels(int num_channels) override;
+ ITensorInfo &set_format(Format format) override;
+ ITensorInfo &set_tensor_shape(TensorShape shape) override;
+ ITensorInfo &set_fixed_point_position(int fixed_point_position) override;
+ ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) override;
+ ITensorInfo &reset_padding() override;
+ bool auto_padding() override;
bool extend_padding(const PaddingSize &padding) override;
size_t dimension(size_t index) const override
{
@@ -264,9 +280,10 @@ public:
{
return _is_resizable;
}
- void set_is_resizable(bool is_resizable) override
+ ITensorInfo &set_is_resizable(bool is_resizable) override
{
_is_resizable = is_resizable;
+ return *this;
}
ValidRegion valid_region() const override
{
@@ -276,6 +293,10 @@ public:
{
_valid_region = std::move(valid_region);
}
+ QuantizationInfo quantization_info() const override
+ {
+ return _quantization_info;
+ }
private:
/** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
@@ -284,17 +305,18 @@ private:
*/
std::tuple<Strides, size_t, size_t> calculate_padding_requirements(const PaddingSize &padding);
- size_t _total_size;
- int _fixed_point_position;
- size_t _offset_first_element_in_bytes;
- Strides _strides_in_bytes;
- size_t _num_channels;
- TensorShape _tensor_shape;
- DataType _data_type;
- Format _format;
- bool _is_resizable;
- ValidRegion _valid_region;
- PaddingSize _padding;
+ size_t _total_size;
+ int _fixed_point_position;
+ size_t _offset_first_element_in_bytes;
+ Strides _strides_in_bytes;
+ size_t _num_channels;
+ TensorShape _tensor_shape;
+ DataType _data_type;
+ Format _format;
+ bool _is_resizable;
+ ValidRegion _valid_region;
+ PaddingSize _padding;
+ QuantizationInfo _quantization_info;
};
}
#endif /*__ARM_COMPUTE_TENSORINFO_H__ */
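The new quantisation-aware constructor and the clone() override in use (shape and quantisation values are illustrative):

#include "arm_compute/core/TensorInfo.h"

void tensor_info_example()
{
    using namespace arm_compute;
    // Quantisation-aware construction: 16x16, single channel, QASYMM8.
    TensorInfo info(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128));

    // clone() returns a polymorphic deep copy of the metadata.
    std::unique_ptr<ITensorInfo> copy = info.clone();
    // copy->quantization_info() compares equal to info.quantization_info().
}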
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
index 3b395e74c..ad102607e 100644
--- a/arm_compute/core/TensorShape.h
+++ b/arm_compute/core/TensorShape.h
@@ -117,8 +117,8 @@ public:
/** Collapse the first n dimensions.
*
+     * @param[in] n     Number of dimensions to collapse into @p first.
* @param[in] first Dimensions into which the following @p n are collapsed.
- * @param[in] n Number of dimensions to collapse into @p first.
*/
void collapse(size_t n, size_t first = 0)
{
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f9766b39b..538449b40 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -25,9 +25,13 @@
#define __ARM_COMPUTE_TYPES_H__
#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/QAsymm8.h"
+#include "arm_compute/core/Rounding.h"
+#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorShape.h"
#include "support/Half.h"
+#include <cmath>
#include <cstddef>
#include <cstdint>
#include <string>
@@ -38,26 +42,29 @@ namespace arm_compute
/** 16-bit floating point type */
using half = half_float::half;
+/** Permutation vector */
+using PermutationVector = Strides;
+
/** Image colour formats */
enum class Format
{
- UNKNOWN, /** Unknown image format */
- U8, /** 1 channel, 1 U8 per channel */
- S16, /** 1 channel, 1 S16 per channel */
- U16, /** 1 channel, 1 U16 per channel */
- S32, /** 1 channel, 1 S32 per channel */
- U32, /** 1 channel, 1 U32 per channel */
- F16, /** 1 channel, 1 F16 per channel */
- F32, /** 1 channel, 1 F32 per channel */
- UV88, /** 2 channel, 1 U8 per channel */
- RGB888, /** 3 channels, 1 U8 per channel */
- RGBA8888, /** 4 channels, 1 U8 per channel */
- YUV444, /** A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */
- YUYV422, /** A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
- NV12, /** A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
- NV21, /** A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
- IYUV, /** A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
- UYVY422 /** A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte */
+ UNKNOWN, /**< Unknown image format */
+ U8, /**< 1 channel, 1 U8 per channel */
+ S16, /**< 1 channel, 1 S16 per channel */
+ U16, /**< 1 channel, 1 U16 per channel */
+ S32, /**< 1 channel, 1 S32 per channel */
+ U32, /**< 1 channel, 1 U32 per channel */
+ F16, /**< 1 channel, 1 F16 per channel */
+ F32, /**< 1 channel, 1 F32 per channel */
+ UV88, /**< 2 channel, 1 U8 per channel */
+ RGB888, /**< 3 channels, 1 U8 per channel */
+ RGBA8888, /**< 4 channels, 1 U8 per channel */
+ YUV444, /**< A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */
+ YUYV422, /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
+ NV12, /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
+ NV21, /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
+ IYUV, /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
+ UYVY422 /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte */
};
/** Available data types */
@@ -67,6 +74,7 @@ enum class DataType
U8,
S8,
QS8,
+ QASYMM8,
U16,
S16,
QS16,
@@ -81,6 +89,13 @@ enum class DataType
SIZET
};
+/** Available Sampling Policies */
+enum class SamplingPolicy
+{
+ CENTER, /**< Samples are taken at pixel center */
+ TOP_LEFT /**< Samples are taken at pixel top left corner */
+};
+
/** Constant value of the border pixels when using BorderMode::CONSTANT */
constexpr uint8_t CONSTANT_BORDER_VALUE = 199;
@@ -90,6 +105,53 @@ constexpr float SCALE_PYRAMID_HALF = 0.5f;
/* Constant value used to indicate a ORB scaled pyramid */
constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01;
+/** Quantization settings (used for QASYMM8 data type) */
+struct QuantizationInfo
+{
+ QuantizationInfo()
+ : scale(0.0f), offset(0)
+ {
+ }
+
+ QuantizationInfo(float scale, int offset)
+ : scale(scale), offset(offset)
+ {
+ }
+
+    bool operator==(const QuantizationInfo &other) const
+ {
+ return scale == other.scale && offset == other.offset;
+ }
+
+    bool operator!=(const QuantizationInfo &other) const
+ {
+ return !(*this == other);
+ }
+
+ float scale; /**< scale */
+ int offset; /**< offset */
+
+ /** Quantizes a value using the scale/offset in this QuantizationInfo */
+ qasymm8_t quantize(float value, RoundingPolicy rounding_policy) const
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::quantize: scale == 0");
+ return sqcvt_qasymm8_f32(value, scale, offset, rounding_policy);
+ }
+
+ /** Dequantizes a value using the scale/offset in this QuantizationInfo */
+ float dequantize(qasymm8_t value) const
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::dequantize: scale == 0");
+ return scvt_f32_qasymm8(value, scale, offset);
+ }
+
+ /** Indicates whether this QuantizationInfo has valid settings or not */
+ bool empty() const
+ {
+ return scale == 0;
+ }
+};
+
struct ValidRegion
{
ValidRegion()
@@ -234,14 +296,6 @@ enum class ThresholdType
RANGE /**< Threshold with two values*/
};
-/** Rounding method */
-enum class RoundingPolicy
-{
- TO_ZERO, /**< Truncates the least significand values that are lost in operations. */
- TO_NEAREST_UP, /**< Rounds to nearest value; half rounds up */
- TO_NEAREST_EVEN /**< Rounds to nearest value; half rounds to nearest even */
-};
-
/** Termination criteria */
enum class Termination
{
@@ -418,7 +472,32 @@ public:
unsigned int pad_x = 0, unsigned int pad_y = 0,
DimensionRoundingType round = DimensionRoundingType::FLOOR)
: _stride(std::make_pair(stride_x, stride_y)),
- _pad(std::make_pair(pad_x, pad_y)),
+ _pad_left(pad_x),
+ _pad_top(pad_y),
+ _pad_right(pad_x),
+ _pad_bottom(pad_y),
+ _round_type(round)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] stride_x Stride, in elements, across x.
+ * @param[in] stride_y Stride, in elements, across y.
+ * @param[in] pad_left Padding across x on the left, in elements.
+ * @param[in] pad_top Padding across y on the top, in elements.
+ * @param[in] pad_right Padding across x on the right, in elements.
+ * @param[in] pad_bottom Padding across y on the bottom, in elements.
+ * @param[in] round Dimensions rounding.
+ */
+ PadStrideInfo(unsigned int stride_x, unsigned int stride_y,
+ unsigned int pad_left, unsigned int pad_right,
+ unsigned int pad_top, unsigned int pad_bottom,
+ DimensionRoundingType round)
+ : _stride(std::make_pair(stride_x, stride_y)),
+ _pad_left(pad_left),
+ _pad_top(pad_top),
+ _pad_right(pad_right),
+ _pad_bottom(pad_bottom),
_round_type(round)
{
}
@@ -428,16 +507,45 @@ public:
}
std::pair<unsigned int, unsigned int> pad() const
{
- return _pad;
+        // This accessor should be used only when the padding is symmetric
+ ARM_COMPUTE_ERROR_ON(_pad_left != _pad_right || _pad_top != _pad_bottom);
+ return std::make_pair(_pad_left, _pad_top);
+ }
+
+ unsigned int pad_left() const
+ {
+ return _pad_left;
+ }
+ unsigned int pad_right() const
+ {
+ return _pad_right;
+ }
+ unsigned int pad_top() const
+ {
+ return _pad_top;
}
+ unsigned int pad_bottom() const
+ {
+ return _pad_bottom;
+ }
+
DimensionRoundingType round() const
{
return _round_type;
}
+ bool has_padding() const
+ {
+ return (_pad_left != 0 || _pad_top != 0 || _pad_right != 0 || _pad_bottom != 0);
+ }
+
private:
std::pair<unsigned int, unsigned int> _stride;
- std::pair<unsigned int, unsigned int> _pad;
+ unsigned int _pad_left;
+ unsigned int _pad_top;
+ unsigned int _pad_right;
+ unsigned int _pad_bottom;
+
DimensionRoundingType _round_type;
};
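A short usage sketch of the new per-side padding API (values hypothetical; note the constructor order is left, right, top, bottom):

    // Symmetric padding: the legacy pad() accessor remains valid.
    PadStrideInfo sym(1, 1, 1, 1);                 // stride 1x1, pad_x = pad_y = 1
    auto xy = sym.pad();                           // {1, 1}

    // Asymmetric padding: use the per-side accessors; pad() would trigger the assertion above.
    PadStrideInfo asym(2, 2, 0 /*left*/, 1 /*right*/, 0 /*top*/, 1 /*bottom*/, DimensionRoundingType::FLOOR);
    unsigned int right = asym.pad_right();         // 1
    bool padded        = asym.has_padding();       // true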
@@ -445,14 +553,35 @@ private:
class PoolingLayerInfo
{
public:
+ /** Default Constructor */
+ PoolingLayerInfo()
+ : _pool_type(PoolingType::MAX), _pool_size(0), _pad_stride_info(PadStrideInfo()), _exclude_padding(false), _is_global_pooling(false)
+ {
+ }
/** Default Constructor
*
- * @param[in] pool_type Pooling type @ref PoolingType. Defaults to @ref PoolingType::MAX
- * @param[in] pool_size (Optional) Pooling size, in elements, across x and y. Defaults to 2.
+ * @param[in] pool_type Pooling type @ref PoolingType.
+ * @param[in] pool_size Pooling size, in elements, across x and y.
* @param[in] pad_stride_info (Optional) Padding and stride information @ref PadStrideInfo
+     * @param[in] exclude_padding (Optional) Whether to exclude padding from the pooling area calculation.
+     *                            True excludes padding while false includes it (used in AVG/L2 pooling to determine the pooling area).
+     *                            Defaults to false.
+ */
+ explicit PoolingLayerInfo(PoolingType pool_type,
+ unsigned int pool_size,
+ PadStrideInfo pad_stride_info = PadStrideInfo(),
+ bool exclude_padding = false)
+ : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info), _exclude_padding(exclude_padding), _is_global_pooling(false)
+ {
+ }
+    /** Constructor
+ *
+ * @note This constructor is used for global pooling
+ *
+ * @param[in] pool_type Pooling type @ref PoolingType.
*/
- PoolingLayerInfo(PoolingType pool_type = PoolingType::MAX, unsigned int pool_size = 2, PadStrideInfo pad_stride_info = PadStrideInfo())
- : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info)
+ explicit PoolingLayerInfo(PoolingType pool_type)
+ : _pool_type(pool_type), _pool_size(0), _pad_stride_info(PadStrideInfo(1, 1, 0, 0)), _exclude_padding(false), _is_global_pooling(true)
{
}
PoolingType pool_type() const
@@ -467,11 +596,21 @@ public:
{
return _pad_stride_info;
}
+ bool exclude_padding() const
+ {
+ return _exclude_padding;
+ }
+ bool is_global_pooling() const
+ {
+ return _is_global_pooling;
+ }
private:
PoolingType _pool_type;
unsigned int _pool_size;
PadStrideInfo _pad_stride_info;
+ bool _exclude_padding;
+ bool _is_global_pooling;
};
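A hedged sketch of the construction modes (values hypothetical; for global pooling, _pool_size is left at 0, so the pooling size is presumably derived from the input dimensions by the consuming kernel):

    PoolingLayerInfo avg(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1), true /*exclude_padding*/);
    PoolingLayerInfo global(PoolingType::MAX);     // global pooling over the whole input
    bool is_global = global.is_global_pooling();   // true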
/** ROI Pooling Layer Information class */
@@ -565,12 +704,14 @@ public:
*
 * @param[in] type      The normalization type. Can be @ref NormType::IN_MAP_1D, @ref NormType::IN_MAP_2D or @ref NormType::CROSS_MAP
* @param[in] norm_size The normalization size is the number of elements to normalize across. Defaults to 5.
- * @param[in] alpha Alpha parameter used by normalization equation. Defaults to 0.0001.
- * @param[in] beta Beta parameter used by normalization equation. Defaults to 0.5.
- * @param[in] kappa Kappa parameter used by [Krichevksy 2012] Across Channel Local Brightness Normalization equation.
+ * @param[in] alpha (Optional) Alpha parameter used by normalization equation. Defaults to 0.0001.
+ * @param[in] beta (Optional) Beta parameter used by normalization equation. Defaults to 0.5.
+     * @param[in] kappa     (Optional) Kappa parameter used by [Krizhevsky 2012] Across Channel Local Brightness Normalization equation.
+     * @param[in] is_scaled (Optional) Boolean that specifies if alpha will be scaled by the normalization size or not.
+     *                      Should be false to follow [Krizhevsky 2012].
*/
- NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f)
- : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa)
+ NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f, bool is_scaled = true)
+ : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa), _is_scaled(is_scaled)
{
}
NormType type() const
@@ -593,17 +734,25 @@ public:
{
return _kappa;
}
- /** Return the scaling factor of the normalization function. If kappa is not
- * 1 then [Krichevksy 2012] normalization scaling is specified. Scaling
- * factor takes into account the total number of elements used for the
- * normalization, so in case of 2 dimensions this is _norm_size^2.
+ bool is_cross_map() const
+ {
+ return _type == NormType::CROSS_MAP;
+ }
+ bool is_in_map() const
+ {
+ return !is_cross_map();
+ }
+    /** Return the scaling factor of the normalization function.
+     *
+     * If is_scaled is set to false, [Krizhevsky 2012] normalization scaling is used and alpha is returned as-is;
+     * otherwise alpha is scaled by the total number of elements used for the normalization.
*
* @return The normalization scaling factor.
*/
float scale_coeff() const
{
const uint32_t size = (_type == NormType::IN_MAP_2D) ? _norm_size * _norm_size : _norm_size;
- return (_kappa == 1.f) ? (_alpha / size) : _alpha;
+ return (_is_scaled) ? (_alpha / size) : _alpha;
}
private:
@@ -612,6 +761,7 @@ private:
float _alpha;
float _beta;
float _kappa;
+ bool _is_scaled;
};
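A worked example of scale_coeff() under both settings (values hypothetical):

    // is_scaled == true (default): alpha is divided by the window element count.
    NormalizationLayerInfo scaled(NormType::IN_MAP_2D, 5, 0.0001f, 0.5f);
    float c1 = scaled.scale_coeff();               // 0.0001f / (5 * 5)

    // is_scaled == false: alpha is returned as-is, following [Krizhevsky 2012].
    NormalizationLayerInfo plain(NormType::CROSS_MAP, 5, 0.0001f, 0.5f, 1.f, false);
    float c2 = plain.scale_coeff();                // 0.0001f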
/** Convolution Layer Weights Information class. This class stores the necessary information to compute convolution layer when the weights are already reshaped */
@@ -666,6 +816,58 @@ private:
const unsigned int _num_kernels;
};
+/** GEMM Information class. This class stores the necessary information to compute GEMM functions */
+class GEMMInfo
+{
+public:
+ /** Default constructor */
+ GEMMInfo()
+ : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] is_a_reshaped True if the matrix A has been reshaped
+ * @param[in] is_b_reshaped True if the matrix B has been reshaped
+ * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
+ */
+ GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run)
+ : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run)
+ {
+ }
+ /** Flag which specifies if the matrix A has been reshaped
+ *
+ * @return True if the matrix A has been reshaped
+ */
+ bool is_a_reshaped() const
+ {
+ return _is_a_reshaped;
+ };
+ /** Flag which specifies if the matrix B has been reshaped
+ *
+ * @return True if the matrix B has been reshaped
+ */
+ bool is_b_reshaped() const
+ {
+ return _is_b_reshaped;
+ };
+    /** Flag which specifies if the reshape of matrix B should be executed only for the first run
+     *
+     * @note This flag could be set to TRUE when GEMM is used to accelerate a convolution layer
+     *
+     * @return True if the reshape of matrix B happens only for the first run
+ */
+ bool reshape_b_only_on_first_run() const
+ {
+ return _reshape_b_only_on_first_run;
+ };
+
+private:
+ const bool _is_a_reshaped;
+ const bool _is_b_reshaped;
+ const bool _reshape_b_only_on_first_run;
+};
+
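For example, a GEMM backing a convolution layer with constant weights could reshape B once and reuse it across runs (a usage sketch, not library-mandated policy):

    GEMMInfo info(false /*is_a_reshaped*/, false /*is_b_reshaped*/, true /*reshape_b_only_on_first_run*/);
    bool reuse_b = info.reshape_b_only_on_first_run();   // true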
/** IO formatting information class*/
struct IOFormatInfo
{
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 06d674644..f78add13f 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_UTILS_H__
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/Types.h"
#include <algorithm>
@@ -92,6 +93,7 @@ inline size_t data_size_from_type(DataType data_type)
case DataType::U8:
case DataType::S8:
case DataType::QS8:
+ case DataType::QASYMM8:
return 1;
case DataType::U16:
case DataType::S16:
@@ -166,6 +168,7 @@ inline size_t element_size_from_data_type(DataType dt)
case DataType::S8:
case DataType::U8:
case DataType::QS8:
+ case DataType::QASYMM8:
return 1;
case DataType::U16:
case DataType::S16:
@@ -344,15 +347,52 @@ inline size_t num_channels_from_format(Format format)
}
}
+/** Return the promoted data type of a given data type.
+ *
+ * @note If the promoted data type is not supported, an error will be thrown
+ *
+ * @param[in] dt Data type to get the promoted type of.
+ *
+ * @return Promoted data type
+ */
+inline DataType get_promoted_data_type(DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ return DataType::U16;
+ case DataType::S8:
+ return DataType::S16;
+ case DataType::QS8:
+ return DataType::QS16;
+ case DataType::U16:
+ return DataType::U32;
+ case DataType::S16:
+ return DataType::S32;
+ case DataType::QS16:
+ return DataType::QS32;
+ case DataType::QASYMM8:
+ case DataType::F16:
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ case DataType::QS32:
+ ARM_COMPUTE_ERROR("Unsupported data type promotions!");
+ default:
+ ARM_COMPUTE_ERROR("Undefined data type!");
+ }
+ return DataType::UNKNOWN;
+}
+
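Usage sketch:

    DataType wide = get_promoted_data_type(DataType::U8);   // DataType::U16
    // get_promoted_data_type(DataType::F32) would raise ARM_COMPUTE_ERROR: no promotion is defined.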
/** Separate a 2D convolution into two 1D convolutions
-*
-* @param[in] conv 2D convolution
-* @param[out] conv_col 1D vertical convolution
-* @param[out] conv_row 1D horizontal convolution
-* @param[in] size Size of the 2D convolution
-*
-* @return true if the separation was successful
-*/
+ *
+ * @param[in] conv 2D convolution
+ * @param[out] conv_col 1D vertical convolution
+ * @param[out] conv_row 1D horizontal convolution
+ * @param[in] size Size of the 2D convolution
+ *
+ * @return true if the separation was successful
+ */
inline bool separate_matrix(const int16_t *conv, int16_t *conv_col, int16_t *conv_row, uint8_t size)
{
int32_t min_col = -1;
@@ -562,6 +602,38 @@ inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t siz
}
}
+/** Returns expected shape for the deconvolution output tensor.
+ *
+ * @param[in] out_dims Width and height of the output tensor; these values can be obtained with the function deconvolution_output_dimensions.
+ * @param[in] input Shape of the input tensor.
+ * @param[in] weights Shape of the weights tensor.
+ *
+ * @return Deconvolution output tensor shape.
+ */
+TensorShape deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights);
+
+/** Returns expected width and height of the deconvolution's output tensor.
+ *
+ * @param[in] in_width Width of input tensor (Number of columns)
+ * @param[in] in_height Height of input tensor (Number of rows)
+ * @param[in] kernel_width Kernel width.
+ * @param[in] kernel_height Kernel height.
+ * @param[in] padx X axis padding.
+ * @param[in] pady Y axis padding.
+ * @param[in] ax            The number of zeros added to the right edge of the input.
+ * @param[in] ay            The number of zeros added to the top edge of the input.
+ * @param[in] upscalex How much to scale the X axis.
+ * @param[in] upscaley How much to scale the Y axis.
+ * @param[in] round Rounding policy to be used when computing the output's dimensions.
+ *
+ * @return A pair with the new width in the first position and the new height in the second.
+ */
+const std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
+ unsigned int kernel_width, unsigned int kernel_height,
+ unsigned int padx, unsigned int pady, unsigned int ax, unsigned int ay,
+ float upscalex, float upscaley, DimensionRoundingType round);
+
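A hypothetical call (argument values invented for illustration; the resulting pair is whatever the implementation computes under the given rounding policy):

    const auto dims = deconvolution_output_dimensions(4, 4,      // input width, height
                                                      3, 3,      // kernel width, height
                                                      0, 0,      // padx, pady
                                                      0, 0,      // ax, ay
                                                      2.f, 2.f,  // upscalex, upscaley
                                                      DimensionRoundingType::FLOOR);
    // dims.first is the output width, dims.second the output height.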
/** Returns expected width and height of output scaled tensor depending on dimensions rounding mode.
*
* @param[in] width Width of input tensor (Number of columns)
@@ -674,6 +746,28 @@ inline bool is_data_type_float(DataType dt)
}
}
+/** Check if a given data type is of quantized type
+ *
+ * @note Quantized is considered a super-set of fixed-point and asymmetric data types.
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of quantized type, else false.
+ */
+inline bool is_data_type_quantized(DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::QS8:
+ case DataType::QASYMM8:
+ case DataType::QS16:
+ case DataType::QS32:
+ return true;
+ default:
+ return false;
+ }
+}
+
/** Check if a given data type is of fixed point type
*
* @param[in] dt Input data type.
@@ -693,6 +787,23 @@ inline bool is_data_type_fixed_point(DataType dt)
}
}
+/** Check if a given data type is of asymmetric quantized type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of asymmetric quantized type, else false.
+ */
+inline bool is_data_type_quantized_asymmetric(DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::QASYMM8:
+ return true;
+ default:
+ return false;
+ }
+}
+
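The two predicates differ in scope, e.g.:

    is_data_type_quantized(DataType::QS16);                // true: fixed point counts as quantized
    is_data_type_quantized_asymmetric(DataType::QS16);     // false: only QASYMM8 qualifies
    is_data_type_quantized_asymmetric(DataType::QASYMM8);  // true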
/** Create a string with the float in full precision.
*
* @param val Floating point value
@@ -727,7 +838,16 @@ void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int
{
s.width(stream_width);
}
- s << std::right << static_cast<print_type>(ptr[i]) << element_delim;
+
+ if(std::is_same<typename std::decay<T>::type, half>::value)
+ {
+        // We use T instead of print_type here because std::is_floating_point<half> is false, which would make print_type int.
+ s << std::right << static_cast<T>(ptr[i]) << element_delim;
+ }
+ else
+ {
+ s << std::right << static_cast<print_type>(ptr[i]) << element_delim;
+ }
}
}
@@ -749,7 +869,17 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u
{
std::stringstream ss;
ss.copyfmt(s);
- ss << static_cast<print_type>(ptr[i]);
+
+ if(std::is_same<typename std::decay<T>::type, half>::value)
+ {
+        // We use T instead of print_type here because std::is_floating_point<half> is false, which would make print_type int.
+ ss << static_cast<T>(ptr[i]);
+ }
+ else
+ {
+ ss << static_cast<print_type>(ptr[i]);
+ }
+
max_width = std::max<int>(max_width, ss.str().size());
}
return max_width;
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index 34da339f0..4ef0e1143 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -64,9 +64,9 @@ inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimension
/** Functor to compare two @ref Dimensions objects and throw an error on mismatch.
*
* @param[in] dim Object to compare against.
- * @param[in] function Function in which the error occured.
- * @param[in] file File in which the error occured.
- * @param[in] line Line in which the error occured.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
*/
template <typename T>
class compare_dimension
@@ -81,10 +81,11 @@ public:
*
* @param[in] dim To be compared object.
*/
- void operator()(const Dimensions<T> &dim)
+ arm_compute::Status operator()(const Dimensions<T> &dim)
{
- ARM_COMPUTE_ERROR_ON_LOC_MSG(have_different_dimensions(_dim, dim, 0), _function, _file, _line,
- "Objects have different dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(have_different_dimensions(_dim, dim, 0), _function, _file, _line,
+ "Objects have different dimensions");
+ return arm_compute::Status{};
}
private:
@@ -93,264 +94,462 @@ private:
const char *const _file;
const int _line;
};
+
+template <typename F>
+inline arm_compute::Status for_each_error(F &&)
+{
+ return arm_compute::Status{};
+}
+
+template <typename F, typename T, typename... Ts>
+inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&... args)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
+ ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
+ return arm_compute::Status{};
+}
+
+template <typename T>
+struct get_tensor_info_t;
+template <>
+struct get_tensor_info_t<ITensorInfo *>
+{
+ ITensorInfo *operator()(const ITensor *tensor)
+ {
+ return tensor->info();
+ }
+};
} // namespace detail
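detail::for_each_error applies a Status-returning functor to each argument in turn and short-circuits at the first failure. A minimal sketch using only helpers declared in this header (ptr_a/ptr_b/ptr_c are hypothetical pointers):

    // Stops at the first null argument; later arguments are never inspected.
    arm_compute::Status s = detail::for_each_error(
        [](const void *p) { return error_on_nullptr(__func__, __FILE__, __LINE__, p); },
        ptr_a, ptr_b, ptr_c);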
-/** Throw an error if one of the pointers is a nullptr.
+
+/** Create an error if one of the pointers is a nullptr.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] pointers Pointers to check against nullptr.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] pointers Pointers to check against nullptr.
+ * @return Status
*/
template <typename... Ts>
-void error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
+inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
{
- auto is_nullptr = [&](const void *ptr)
+ const std::array<const void *, sizeof...(Ts)> pointers_array{ { std::forward<Ts>(pointers)... } };
+ bool has_nullptr = std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr)
{
- ARM_COMPUTE_ERROR_ON_LOC(ptr == nullptr, function, file, line);
- };
-
- for_each(is_nullptr, std::forward<Ts>(pointers)...);
+ return (ptr == nullptr);
+ });
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(has_nullptr, function, file, line, "Nullptr object!");
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_NULLPTR(...) ::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_NULLPTR(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__))
-/** Throw an error if the passed window is invalid.
+/** Return an error if the passed window is invalid.
*
* The subwindow is invalid if:
* - It is not a valid window.
* - Its dimensions don't match the full window's ones
* - The step for each of its dimension is not identical to the corresponding one of the full window.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] full Full size window
- * @param[in] win Window to validate.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] full Full size window
+ * @param[in] win Window to validate.
+ *
+ * @return Status
*/
-void error_on_mismatching_windows(const char *function, const char *file, const int line,
- const Window &full, const Window &win);
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) ::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w)
+arm_compute::Status error_on_mismatching_windows(const char *function, const char *file, const int line,
+ const Window &full, const Window &win);
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w))
+#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w))
-/** Throw an error if the passed subwindow is invalid.
+/** Return an error if the passed subwindow is invalid.
*
* The subwindow is invalid if:
* - It is not a valid window.
* - It is not fully contained inside the full window
* - The step for each of its dimension is not identical to the corresponding one of the full window.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] full Full size window
- * @param[in] sub Sub-window to validate.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] full Full size window
+ * @param[in] sub Sub-window to validate.
+ *
+ * @return Status
*/
-void error_on_invalid_subwindow(const char *function, const char *file, const int line,
- const Window &full, const Window &sub);
-#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) ::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s)
+arm_compute::Status error_on_invalid_subwindow(const char *function, const char *file, const int line,
+ const Window &full, const Window &sub);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s))
+#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBWINDOW(f, s) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s))
-/** Throw an error if the window can't be collapsed at the given dimension.
+/** Return an error if the window can't be collapsed at the given dimension.
*
 * The window cannot be collapsed if the given dimension is not equal to the full window's dimension or does not start from 0.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] full Full size window
- * @param[in] window Window to be collapsed.
- * @param[in] dim Dimension need to be checked.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] full Full size window
+ * @param[in] window Window to be collapsed.
+ * @param[in] dim      Dimension that needs to be checked.
+ *
+ * @return Status
*/
-void error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
- const Window &full, const Window &window, const int dim);
-#define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d)
+arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
+ const Window &full, const Window &window, const int dim);
+#define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
+#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
-/** Throw an error if the passed coordinates have too many dimensions.
+/** Return an error if the passed coordinates have too many dimensions.
*
 * The coordinates have too many dimensions if any of the dimensions greater than or equal to max_dim is different from 0.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] pos Coordinates to validate
- * @param[in] max_dim Maximum number of dimensions allowed.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] pos Coordinates to validate
+ * @param[in] max_dim Maximum number of dimensions allowed.
+ *
+ * @return Status
*/
-void error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
- const Coordinates &pos, unsigned int max_dim);
-#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) ::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md)
+arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
+ const Coordinates &pos, unsigned int max_dim);
+#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md))
+#define ARM_COMPUTE_RETURN_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md))
-/** Throw an error if the passed window has too many dimensions.
+/** Return an error if the passed window has too many dimensions.
*
 * The window has too many dimensions if any of the dimensions greater than or equal to max_dim is different from 0.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] win Window to validate
- * @param[in] max_dim Maximum number of dimensions allowed.
- */
-void error_on_window_dimensions_gte(const char *function, const char *file, const int line,
- const Window &win, unsigned int max_dim);
-#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) ::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md)
-
-/** Throw an error if the passed dimension objects differ.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] win Window to validate
+ * @param[in] max_dim Maximum number of dimensions allowed.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] dim1 The first object to be compared.
- * @param[in] dim2 The second object to be compared.
- * @param[in] dims (Optional) Further allowed objects.
+ * @return Status
+ */
+arm_compute::Status error_on_window_dimensions_gte(const char *function, const char *file, const int line,
+ const Window &win, unsigned int max_dim);
+#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md))
+#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md))
+
+/** Return an error if the passed dimension objects differ.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] dim1 The first object to be compared.
+ * @param[in] dim2 The second object to be compared.
+ * @param[in] dims (Optional) Further allowed objects.
+ *
+ * @return Status
*/
template <typename T, typename... Ts>
-void error_on_mismatching_dimensions(const char *function, const char *file, int line,
- const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
+arm_compute::Status error_on_mismatching_dimensions(const char *function, const char *file, int line,
+ const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
{
- ARM_COMPUTE_UNUSED(function);
- ARM_COMPUTE_UNUSED(file);
- ARM_COMPUTE_UNUSED(line);
-
- for_each(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...);
+ ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...));
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
-/** Throw an error if the passed two tensors have different shapes from the given dimension
+/** Return an error if the passed two tensor infos have different shapes from the given dimension
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_1 The first tensor to be compared.
- * @param[in] tensor_2 The second tensor to be compared.
- * @param[in] tensors (Optional) Further allowed tensors.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_info_1 The first tensor info to be compared.
+ * @param[in] tensor_info_2 The second tensor info to be compared.
+ * @param[in] tensor_infos (Optional) Further allowed tensor infos.
+ *
+ * @return Status
*/
template <typename... Ts>
-void error_on_mismatching_shapes(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
+ const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
{
- error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
+ return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)...);
}
-
-/** Throw an error if the passed two tensors have different shapes from the given dimension
+/** Return an error if the passed two tensors have different shapes from the given dimension
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] upper_dim The dimension from which to check.
- * @param[in] tensor_1 The first tensor to be compared.
- * @param[in] tensor_2 The second tensor to be compared.
- * @param[in] tensors (Optional) Further allowed tensors.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further allowed tensors.
+ *
+ * @return Status
*/
template <typename... Ts>
-void error_on_mismatching_shapes(const char *function, const char *file, const int line,
- unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
{
- ARM_COMPUTE_UNUSED(function);
- ARM_COMPUTE_UNUSED(file);
- ARM_COMPUTE_UNUSED(line);
-
- const std::array < const ITensor *, 2 + sizeof...(Ts) > tensors_array{ { tensor_1, tensor_2, std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_UNUSED(tensors_array);
-
- ARM_COMPUTE_ERROR_ON_LOC(tensors_array.cbegin() == nullptr, function, file, line);
+ return error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
+}
+/** Return an error if the passed two tensors have different shapes from the given dimension
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] upper_dim The dimension from which to check.
+ * @param[in] tensor_info_1 The first tensor info to be compared.
+ * @param[in] tensor_info_2 The second tensor info to be compared.
+ * @param[in] tensor_infos (Optional) Further allowed tensor infos.
+ *
+ * @return Status
+ */
+template <typename... Ts>
+inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
+ unsigned int upper_dim, const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_1 == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_2 == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
- ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_array.cbegin()), tensors_array.cend(), [&](const ITensor * tensor)
+ const std::array < const ITensorInfo *, 2 + sizeof...(Ts) > tensors_info_array{ { tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)... } };
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(), [&](const ITensorInfo * tensor_info)
{
- ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- return detail::have_different_dimensions((*tensors_array.cbegin())->info()->tensor_shape(), tensor->info()->tensor_shape(), upper_dim);
+ return detail::have_different_dimensions((*tensors_info_array.cbegin())->tensor_shape(), tensor_info->tensor_shape(), upper_dim);
}),
function, file, line, "Tensors have different shapes");
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) ::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__)
-
-/** Throw an error if the passed two tensors have different data types
+/** Return an error if the passed two tensors have different shapes from the given dimension
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor The first tensor to be compared.
- * @param[in] tensors (Optional) Further allowed tensors.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] upper_dim The dimension from which to check.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further allowed tensors.
+ *
+ * @return Status
*/
template <typename... Ts>
-void error_on_mismatching_data_types(const char *function, const char *file, const int line,
- const ITensor *tensor, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
+ unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
{
- ARM_COMPUTE_UNUSED(function);
- ARM_COMPUTE_UNUSED(file);
- ARM_COMPUTE_UNUSED(line);
- ARM_COMPUTE_UNUSED(tensor);
-
- ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-
- DataType &&tensor_data_type = tensor->info()->data_type();
- ARM_COMPUTE_UNUSED(tensor_data_type);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_1 == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_2 == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ return arm_compute::Status{};
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__))
- const std::array<const ITensor *, sizeof...(Ts)> tensors_array{ { std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_UNUSED(tensors_array);
+/** Return an error if the passed two tensor infos have different data types
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_info The first tensor info to be compared.
+ * @param[in] tensor_infos (Optional) Further allowed tensor infos.
+ *
+ * @return Status
+ */
+template <typename... Ts>
+inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
+ const ITensorInfo *tensor_info, Ts... tensor_infos)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
- ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor_obj)
+ DataType &&tensor_data_type = tensor_info->data_type();
+ const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { std::forward<Ts>(tensor_infos)... } };
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj)
{
- ARM_COMPUTE_ERROR_ON_LOC(tensor_obj == nullptr, function, file, line);
- return tensor_obj->info()->data_type() != tensor_data_type;
+ return tensor_info_obj->data_type() != tensor_data_type;
}),
function, file, line, "Tensors have different data types");
+ return arm_compute::Status{};
}
+/** Return an error if the passed two tensors have different data types
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor The first tensor to be compared.
+ * @param[in] tensors (Optional) Further allowed tensors.
+ *
+ * @return Status
+ */
+template <typename... Ts>
+inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
+ const ITensor *tensor, Ts... tensors)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(function, file, line, tensor->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ return arm_compute::Status{};
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
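Together, the RETURN variants enable the Status-based validate() idiom this release introduces; a hedged sketch with a hypothetical element-wise function:

    // Hypothetical static validation helper built from the new macros.
    arm_compute::Status validate_add(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *out)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, out);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(a, b, out);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, out);
        return arm_compute::Status{}; // an empty Status signals success
    }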
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)
-
-/** Throw an error if the passed tensors have different fixed point data types or different fixed point positions
+/** Return an error if the passed tensor infos have different fixed point data types or different fixed point positions
*
* @note: If the first tensor doesn't have fixed point data type, the function returns without throwing an error
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_1 The first tensor to be compared.
- * @param[in] tensor_2 The second tensor to be compared.
- * @param[in] tensors (Optional) Further allowed tensors.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_info_1 The first tensor info to be compared.
+ * @param[in] tensor_info_2 The second tensor info to be compared.
+ * @param[in] tensor_infos (Optional) Further allowed tensor infos.
+ *
+ * @return Status
*/
template <typename... Ts>
-void error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
+ const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
{
- ARM_COMPUTE_UNUSED(function);
- ARM_COMPUTE_UNUSED(file);
- ARM_COMPUTE_UNUSED(line);
- ARM_COMPUTE_UNUSED(tensor_1);
- ARM_COMPUTE_UNUSED(tensor_2);
-
- DataType &&first_data_type = tensor_1->info()->data_type();
- const int first_fixed_point_position = tensor_1->info()->fixed_point_position();
- ARM_COMPUTE_UNUSED(first_data_type);
- ARM_COMPUTE_UNUSED(first_fixed_point_position);
-
- if((first_data_type != DataType::QS8) && (first_data_type != DataType::QS16))
+ DataType &&first_data_type = tensor_info_1->data_type();
+ const int first_fixed_point_position = tensor_info_1->fixed_point_position();
+
+ if(!is_data_type_fixed_point(first_data_type))
{
- return;
+ return arm_compute::Status{};
}
- const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_UNUSED(tensors_array);
-
- ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+ const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
{
- return tensor->info()->data_type() != first_data_type;
+ return tensor_info->data_type() != first_data_type;
}),
function, file, line, "Tensors have different fixed point data types");
-
- ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
{
- return tensor->info()->fixed_point_position() != first_fixed_point_position;
+ return tensor_info->fixed_point_position() != first_fixed_point_position;
}),
function, file, line, "Tensors have different fixed point positions");
+
+ return arm_compute::Status{};
}
+/** Return an error if the passed tensors have different fixed point data types or different fixed point positions
+ *
+ * @note: If the first tensor doesn't have a fixed point data type, the function returns without reporting an error
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further allowed tensors.
+ *
+ * @return Status
+ */
+template <typename... Ts>
+inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(function, file, line, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ return arm_compute::Status{};
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
+
+/** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info
+ *
+ * @note: If the first tensor info doesn't have an asymmetric quantized data type, the function returns without reporting an error
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_info_1 The first tensor info to be compared.
+ * @param[in] tensor_info_2 The second tensor info to be compared.
+ * @param[in] tensor_infos (Optional) Further allowed tensor infos.
+ *
+ * @return Status
+ */
+template <typename... Ts>
+inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
+ const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+{
+ DataType &&first_data_type = tensor_info_1->data_type();
+ const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
+
+ if(!is_data_type_quantized_asymmetric(first_data_type))
+ {
+ return arm_compute::Status{};
+ }
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) ::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
+ const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
+ {
+ return tensor_info->data_type() != first_data_type;
+ }),
+ function, file, line, "Tensors have different asymmetric quantized data types");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
+ {
+ return tensor_info->quantization_info() != first_quantization_info;
+ }),
+ function, file, line, "Tensors have different quantization information");
+
+ return arm_compute::Status{};
+}
+/** Return an error if the passed tensors have different asymmetric quantized data types or different quantization info
+ *
+ * @note: If the first tensor doesn't have an asymmetric quantized data type, the function returns without reporting an error
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further allowed tensors.
+ *
+ * @return Status
+ */
+template <typename... Ts>
+inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ return arm_compute::Status{};
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
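Usage mirrors the other mismatch checks, e.g. inside a hypothetical validate() for a QASYMM8 kernel (input/weights/output are hypothetical ITensorInfo pointers):

    // Passes only if all QASYMM8 infos agree on scale/offset (compared via QuantizationInfo::operator!=).
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, weights, output);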
/** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] object Tensor/multi-image to validate.
- * @param[in] format First format allowed.
- * @param[in] formats (Optional) Further allowed formats.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] object Tensor/multi-image to validate.
+ * @param[in] format First format allowed.
+ * @param[in] formats (Optional) Further allowed formats.
*/
template <typename T, typename F, typename... Fs>
void error_on_format_not_in(const char *function, const char *file, const int line,
@@ -374,109 +573,168 @@ void error_on_format_not_in(const char *function, const char *file, const int li
}
#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
-/** Throw an error if the data type of the passed tensor does not match any of the data types provided.
+/** Return an error if the data type of the passed tensor info does not match any of the data types provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_info Tensor info to validate.
+ * @param[in] dt First data type allowed.
+ * @param[in] dts (Optional) Further allowed data types.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor Tensor to validate.
- * @param[in] dt First data type allowed.
- * @param[in] dts (Optional) Further allowed data types.
+ * @return Status
*/
template <typename T, typename... Ts>
-void error_on_data_type_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
+ const ITensorInfo *tensor_info, T &&dt, Ts &&... dts)
{
- ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- const DataType &tensor_dt = tensor->info()->data_type(); //NOLINT
- ARM_COMPUTE_UNUSED(tensor_dt);
-
- ARM_COMPUTE_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
+ const DataType &tensor_dt = tensor_info->data_type(); //NOLINT
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
const std::array<T, sizeof...(Ts)> dts_array{ { std::forward<Ts>(dts)... } };
- ARM_COMPUTE_UNUSED(dts_array);
-
- ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d)
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d)
{
return d == tensor_dt;
}),
function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str());
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) ::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
-
-/** Throw an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided.
+/** Return an error if the data type of the passed tensor does not match any of the data types provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor Tensor to validate.
+ * @param[in] dt First data type allowed.
+ * @param[in] dts (Optional) Further allowed data types.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor Tensor to validate.
- * @param[in] num_channels Number of channels to check
- * @param[in] dt First data type allowed.
- * @param[in] dts (Optional) Further allowed data types.
+ * @return Status
*/
template <typename T, typename... Ts>
-void error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
+ const ITensor *tensor, T &&dt, Ts &&... dts)
{
- error_on_data_type_not_in(function, file, line, tensor, std::forward<T>(dt), std::forward<Ts>(dts)...);
-
- const size_t tensor_nc = tensor->info()->num_channels();
- ARM_COMPUTE_UNUSED(tensor_nc);
-
- ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_nc != num_channels, function, file, line, "Number of channels %d. Required number of channels %d", tensor_nc, num_channels);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...));
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
-/** Throw an error if the tensor is not 2D.
+/** Return an error if the data type or the number of channels of the passed tensor info does not match any of the data types and number of channels provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_info Tensor info to validate.
+ * @param[in] num_channels Number of channels to check
+ * @param[in] dt First data type allowed.
+ * @param[in] dts (Optional) Further allowed data types.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor Tensor to validate.
+ * @return Status
*/
-void error_on_tensor_not_2d(const char *function, const char *file, const int line,
- const ITensor *tensor);
-#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) ::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t)
+template <typename T, typename... Ts>
+inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
+ const ITensorInfo *tensor_info, size_t num_channels, T &&dt, Ts &&... dts)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...));
+ const size_t tensor_nc = tensor_info->num_channels();
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_nc != num_channels, function, file, line, "Number of channels %zu. Required number of channels %zu", tensor_nc, num_channels);
+ return arm_compute::Status{};
+}
+/** Return an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor Tensor to validate.
+ * @param[in] num_channels Number of channels to check
+ * @param[in] dt First data type allowed.
+ * @param[in] dts (Optional) Further allowed data types.
+ *
+ * @return Status
+ */
+template <typename T, typename... Ts>
+inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
+ const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels, std::forward<T>(dt), std::forward<Ts>(dts)...));
+ return arm_compute::Status{};
+}
+#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
-/** Throw an error if the channel is not in channels.
+/** Return an error if the tensor is not 2D.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor Tensor to validate.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] cn Input channel
- * @param[in] channel First channel allowed.
- * @param[in] channels (Optional) Further allowed channels.
+ * @return Status
+ */
+arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
+ const ITensor *tensor);
+#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t))
+#define ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(t) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t))
+
+/** Return an error if the channel is not in channels.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] cn Input channel
+ * @param[in] channel First channel allowed.
+ * @param[in] channels (Optional) Further allowed channels.
+ *
+ * @return Status
*/
template <typename T, typename... Ts>
-void error_on_channel_not_in(const char *function, const char *file, const int line,
- T cn, T &&channel, Ts &&... channels)
+inline arm_compute::Status error_on_channel_not_in(const char *function, const char *file, const int line,
+ T cn, T &&channel, Ts &&... channels)
{
- ARM_COMPUTE_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line);
const std::array<T, sizeof...(Ts)> channels_array{ { std::forward<Ts>(channels)... } };
- ARM_COMPUTE_UNUSED(channels_array);
- ARM_COMPUTE_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f)
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f)
{
return f == cn;
}),
function, file, line);
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) ::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN(c, ...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__))
-/** Throw an error if the channel is not in format.
+/** Return an error if the channel is not in format.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] fmt Input format
+ * @param[in] cn Input channel to check
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] fmt Input channel
- * @param[in] cn First channel allowed.
+ * @return Status
*/
-void error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
- Format fmt, Channel cn);
-#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) ::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c)
+arm_compute::Status error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
+ Format fmt, Channel cn);
+#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c))
+#define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c))
-/** Throw an error if the @ref IMultiHOG container is invalid
+/** Return an error if the @ref IMultiHOG container is invalid
*
* An @ref IMultiHOG container is invalid if:
*
@@ -484,27 +742,35 @@ void error_on_channel_not_in_known_format(const char *function, const char *file
* -# it doesn't contain models
* -# it doesn't have the HOG data objects with the same phase_type, normalization_type and l2_hyst_threshold (if normalization_type == L2HYS_NORM)
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] multi_hog IMultiHOG container to validate
- */
-void error_on_invalid_multi_hog(const char *function, const char *file, const int line,
- const IMultiHOG *multi_hog);
-#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) ::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m)
-
-/** Throw an error if the kernel is not configured.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] multi_hog IMultiHOG container to validate
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] kernel Kernel to validate.
+ * @return Status
*/
-void error_on_unconfigured_kernel(const char *function, const char *file, const int line,
- const IKernel *kernel);
-#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) ::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k)
+arm_compute::Status error_on_invalid_multi_hog(const char *function, const char *file, const int line,
+ const IMultiHOG *multi_hog);
+#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m))
+#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_MULTI_HOG(m) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m))
+
+/** Return an error if the kernel is not configured.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] kernel Kernel to validate.
+ *
+ * @return Status
+ */
+arm_compute::Status error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+ const IKernel *kernel);
+#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k))
+#define ARM_COMPUTE_RETURN_ERROR_ON_UNCONFIGURED_KERNEL(k) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k))
-/** Throw an error if if the coordinates and shape of the subtensor are within the parent tensor.
+/** Return an error if the coordinates and shape of the subtensor are not within the parent tensor.
*
* @param[in] function Function in which the error occurred.
* @param[in] file Name of the file where the error occurred.
@@ -512,68 +778,122 @@ void error_on_unconfigured_kernel(const char *function, const char *file, const
* @param[in] parent_shape Parent tensor shape
* @param[in] coords Coordinates inside the parent tensor where the first element of the subtensor is
* @param[in] shape Shape of the subtensor
+ *
+ * @return Status
*/
-void error_on_invalid_subtensor(const char *function, const char *file, const int line,
- const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
-#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) ::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s)
+arm_compute::Status error_on_invalid_subtensor(const char *function, const char *file, const int line,
+ const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s))
+#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s))
-/** Throw an error if the valid region of a subtensor is not inside the valid region of the parent tensor.
+/** Return an error if the valid region of a subtensor is not inside the valid region of the parent tensor.
*
* @param[in] function Function in which the error occurred.
* @param[in] file Name of the file where the error occurred.
* @param[in] line Line on which the error occurred.
* @param[in] parent_valid_region Parent valid region.
* @param[in] valid_region Valid region of subtensor.
- */
-void error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
- const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
-#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)
-
-/** Throw an error if the input fixed-point positions are different.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_1 The first tensor to be compared.
- * @param[in] tensor_2 The second tensor to be compared.
- * @param[in] tensors (Optional) Further allowed tensors.
+ * @return Status
+ */
+arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+ const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+
+/** Return an error if the input fixed-point positions are different.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_info_1 The first tensor info to be compared.
+ * @param[in] tensor_info_2 The second tensor info to be compared.
+ * @param[in] tensor_infos (Optional) Further allowed tensor infos.
+ *
+ * @return Status
*/
template <typename... Ts>
-void error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
+ const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
{
- const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_UNUSED(tensors_array);
-
- ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+ const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_info_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_info_array.begin(), tensor_info_array.end(), [&](const ITensorInfo * tensor_info)
{
- return tensor->info()->fixed_point_position() != tensor_1->info()->fixed_point_position();
+ return tensor_info->fixed_point_position() != tensor_info_1->fixed_point_position();
}),
function, file, line, "Tensors have different fixed-point positions");
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) ::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__)
-
-/** Throw an error if the fixed-point value is not representable in the specified Q format.
+/** Return an error if the input fixed-point positions are different.
*
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] value The floating point value to be checked.
- * @param[in] tensor Input tensor that has information on data type and fixed-point position.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further allowed tensors.
+ *
+ * @return Status
*/
template <typename... Ts>
-void error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
- float value, const ITensor *tensor)
+inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
{
- const int fixed_point_position = tensor->info()->fixed_point_position();
- const DataType dt = tensor->info()->data_type();
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(function, file, line, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ return arm_compute::Status{};
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__))
+
+/** Return an error if the fixed-point value is not representable in the specified Q format.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] value The floating point value to be checked.
+ * @param[in] tensor_info Input tensor info that has information on data type and fixed-point position.
+ *
+ * @return Status
+ */
+inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
+ float value, const ITensorInfo *tensor_info)
+{
+ const int fixed_point_position = tensor_info->fixed_point_position();
+ const DataType dt = tensor_info->data_type();
const unsigned int q_max_range = 0xFFFFFFFFu >> (((sizeof(unsigned int) - element_size_from_data_type(dt)) * 8) + 1);
const float max_range = q_max_range / (static_cast<float>(1 << fixed_point_position));
- ARM_COMPUTE_UNUSED(max_range);
- ARM_COMPUTE_ERROR_ON_LOC_MSG(value > max_range, function, file, line,
- "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(value > max_range, function, file, line,
+ "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position);
+ return arm_compute::Status{};
+}
+/** Return an error if the fixed-point value is not representable in the specified Q format.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] value The floating point value to be checked.
+ * @param[in] tensor Input tensor that has information on data type and fixed-point position.
+ *
+ * @return Status
+ */
+inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
+ float value, const ITensor *tensor)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(function, file, line, value, tensor->info()));
+ return arm_compute::Status{};
}
-#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) ::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
+#define ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__))
}
#endif /* __ARM_COMPUTE_VALIDATE_H__*/
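A worked instance of the representability check above, assuming a QS8 tensor (element size 1 byte) and a 32-bit unsigned int, with fixed_point_position = 5:

    // q_max_range = 0xFFFFFFFFu >> (((4 - 1) * 8) + 1) = 0xFFFFFFFFu >> 25 = 127
    // max_range   = 127 / (1 << 5) = 3.96875
    // => checking 3.5f succeeds (returns Status{}); checking 4.2f produces
    //    "Value 4.200000 is not representable in QS8 with fixed-point position 5"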
diff --git a/arm_compute/core/Logger.h b/arm_compute/core/utils/io/FileHandler.h
index 0848479d3..d915dbe28 100644
--- a/arm_compute/core/Logger.h
+++ b/arm_compute/core/utils/io/FileHandler.h
@@ -21,51 +21,56 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#ifndef __ARM_COMPUTE_IO_FILE_HANDLER_H__
+#define __ARM_COMPUTE_IO_FILE_HANDLER_H__
-#ifndef __ARM_COMPUTE_LOGGER_H__
-#define __ARM_COMPUTE_LOGGER_H__
-
-#include <iostream>
-#include <memory>
-
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
-#define ARM_COMPUTE_LOG(x) (arm_compute::Logger::get().log_info() << x)
-#else /* ARM_COMPUTE_DEBUG_ENABLED */
-#define ARM_COMPUTE_LOG(...)
-#endif /* ARM_COMPUTE_DEBUG_ENABLED */
+#include <fstream>
+#include <string>
namespace arm_compute
{
-/**< Verbosity of the logger */
-enum class LoggerVerbosity
+namespace io
{
- NONE, /**< No info */
- INFO /**< Log info */
-};
-
-/** Logger singleton class */
-class Logger
+/** File Handling interface */
+class FileHandler
{
public:
- static Logger &get();
- void set_logger(std::ostream &ostream, LoggerVerbosity verbosity);
- std::ostream &log_info();
-
-private:
- /** Default constructor */
- Logger();
+ /** Default Constructor */
+ FileHandler();
+ /** Default Destructor */
+ ~FileHandler();
/** Allow instances of this class to be moved */
- Logger(Logger &&) = default;
+ FileHandler(FileHandler &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- Logger(const Logger &) = delete;
+ FileHandler(const FileHandler &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- Logger &operator=(const Logger &) = delete;
+ FileHandler &operator=(const FileHandler &) = delete;
/** Allow instances of this class to be moved */
- Logger &operator=(Logger &&) = default;
+ FileHandler &operator=(FileHandler &&) = default;
+ /** Opens file
+ *
+ * @param[in] filename File name
+ * @param[in] mode File open mode
+ */
+ void open(const std::string &filename, std::ios_base::openmode mode);
+ /** Closes file */
+ void close();
+ /** Returns the file stream
+ *
+ * @return File stream
+ */
+ std::fstream &stream();
+ /** Returns filename of the handled file
+ *
+ * @return Filename
+ */
+ std::string filename() const;
- std::ostream *_ostream;
- std::ostream _nullstream;
- LoggerVerbosity _verbosity;
+private:
+ std::fstream _filestream;
+ std::string _filename;
+ std::ios_base::openmode _mode;
};
-} // arm_compute
-#endif /* __ARM_COMPUTE_LOGGER_H__ */ \ No newline at end of file
+} // namespace io
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */
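A minimal usage sketch for the new io::FileHandler (file name and contents invented for illustration):

    #include "arm_compute/core/utils/io/FileHandler.h"

    void write_example()
    {
        arm_compute::io::FileHandler handler;
        handler.open("example.log", std::ios::out | std::ios::app); // open for appending
        handler.stream() << "message" << std::endl;                 // write through the exposed fstream
        handler.close();
    }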
diff --git a/arm_compute/core/utils/logging/FilePrinter.h b/arm_compute/core/utils/logging/FilePrinter.h
new file mode 100644
index 000000000..e2ae95208
--- /dev/null
+++ b/arm_compute/core/utils/logging/FilePrinter.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_FILE_PRINTER_H__
+#define __ARM_COMPUTE_LOGGING_FILE_PRINTER_H__
+
+#include "arm_compute/core/utils/logging/IPrinter.h"
+
+#include "arm_compute/core/utils/io/FileHandler.h"
+
+namespace arm_compute
+{
+namespace logging
+{
+/** File Printer */
+class FilePrinter final : public Printer
+{
+public:
+ /** Default Constructor
+ *
+ * @param[in] filename File name
+ */
+ FilePrinter(const std::string &filename);
+
+private:
+ // Inherited methods overridden:
+ void print_internal(const std::string &msg) override;
+
+private:
+ io::FileHandler _handler;
+};
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_FILE_PRINTER_H__ */
diff --git a/arm_compute/core/utils/logging/Helpers.h b/arm_compute/core/utils/logging/Helpers.h
new file mode 100644
index 000000000..4bc54e80d
--- /dev/null
+++ b/arm_compute/core/utils/logging/Helpers.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_HELPERS_H__
+#define __ARM_COMPUTE_LOGGING_HELPERS_H__
+
+#include "arm_compute/core/utils/logging/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdio>
+#include <memory>
+#include <sstream>
+#include <string>
+
+namespace arm_compute
+{
+namespace logging
+{
+/** Create a string given a format
+ *
+ * @param[in] fmt String format
+ * @param[in] args Arguments
+ *
+ * @return The formatted string
+ */
+template <typename... Ts>
+inline std::string string_with_format(const std::string &fmt, Ts &&... args)
+{
+ size_t size = support::cpp11::snprintf(nullptr, 0, fmt.c_str(), args...) + 1;
+ auto char_str = support::cpp14::make_unique<char[]>(size);
+ support::cpp11::snprintf(char_str.get(), size, fmt.c_str(), args...);
+ return std::string(char_str.get(), char_str.get() + size - 1);
+}
+/** Wraps a value with angles and returns the string
+ *
+ * @param[in] val Value to wrap
+ *
+ * @return Wrapped string
+ */
+template <typename T>
+inline std::string angle_wrap_value(const T &val)
+{
+ std::ostringstream ss;
+ ss << "[" << val << "]";
+ return ss.str();
+}
+/** Translates a given log level to a string.
+ *
+ * @param[in] log_level @ref LogLevel to be translated to string.
+ *
+ * @return The string describing the logging level.
+ */
+const std::string &string_from_log_level(LogLevel log_level);
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_HELPERS_H__ */
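For illustration, the two helpers above behave as follows, assuming the usual printf formatting semantics of the wrapped snprintf (values invented):

    void format_example()
    {
        using namespace arm_compute::logging;
        std::string a = string_with_format("iteration %d, loss %.2f", 3, 0.25f); // "iteration 3, loss 0.25"
        std::string b = angle_wrap_value(42);                                    // "[42]"
    }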
diff --git a/arm_compute/core/utils/logging/IPrinter.h b/arm_compute/core/utils/logging/IPrinter.h
new file mode 100644
index 000000000..6b410d4d1
--- /dev/null
+++ b/arm_compute/core/utils/logging/IPrinter.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_PRINTER_H__
+#define __ARM_COMPUTE_LOGGING_PRINTER_H__
+
+#include "support/Mutex.h"
+
+namespace arm_compute
+{
+namespace logging
+{
+/** Base printer class to be inherited by other printer classes */
+class Printer
+{
+public:
+ /** Default Constructor */
+ Printer()
+ : _mtx()
+ {
+ }
+ /** Prevent instances of this class from being copied */
+ Printer(const Printer &) = delete;
+ /** Prevent instances of this class from being copied */
+ Printer &operator=(const Printer &) = delete;
+ /** Prevent instances of this class from being moved */
+ Printer(Printer &&) = delete;
+ /** Prevent instances of this class from being moved */
+ Printer &operator=(Printer &&) = delete;
+ /** Default Destructor */
+ virtual ~Printer() = default;
+ /** Print message
+ *
+ * @param[in] msg Message to print
+ */
+ inline void print(const std::string &msg)
+ {
+ std::lock_guard<arm_compute::Mutex> lock(_mtx);
+ print_internal(msg);
+ }
+
+private:
+ /** Interface to be implemented by the child to print a message
+ *
+ * @param[in] msg Message to print
+ */
+ virtual void print_internal(const std::string &msg) = 0;
+
+private:
+ arm_compute::Mutex _mtx;
+};
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_PRINTER_H__ */
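A sketch of a user-defined printer built on the interface above; only print_internal() needs overriding, since the base class print() already serialises concurrent calls through the mutex. StderrPrinter is a hypothetical example, not part of this patch:

    #include <iostream>

    class StderrPrinter final : public arm_compute::logging::Printer
    {
    private:
        void print_internal(const std::string &msg) override
        {
            std::cerr << msg << std::endl; // no locking needed here: print() holds the lock
        }
    };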
diff --git a/arm_compute/core/utils/logging/LogMsgDecorators.h b/arm_compute/core/utils/logging/LogMsgDecorators.h
new file mode 100644
index 000000000..0ffb438be
--- /dev/null
+++ b/arm_compute/core/utils/logging/LogMsgDecorators.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_LOG_MSG_DECORATORS_H__
+#define __ARM_COMPUTE_LOGGING_LOG_MSG_DECORATORS_H__
+
+#include "arm_compute/core/utils/logging/Helpers.h"
+#include "arm_compute/core/utils/logging/Types.h"
+
+#include <chrono>
+#include <ctime>
+#include <string>
+#include <thread>
+
+namespace arm_compute
+{
+namespace logging
+{
+/** Log message decorator interface */
+class IDecorator
+{
+public:
+ /** Default Destructor */
+ virtual ~IDecorator() = default;
+ /** Decorates log message
+ *
+ * @param[in] log_msg Log message to decorate
+ */
+ virtual void decorate(LogMsg &log_msg) = 0;
+};
+
+/** String Decorator
+ *
+ * Appends a user-defined string to the log message
+ */
+class StringDecorator : public IDecorator
+{
+public:
+ /** Default constructor
+ *
+ * @param str String to append
+ */
+ StringDecorator(const std::string &str)
+ : _str(angle_wrap_value(str))
+ {
+ }
+
+ // Inherited methods overridden:
+ void decorate(LogMsg &log_msg) override
+ {
+ log_msg.raw_ += _str;
+ }
+
+private:
+ std::string _str;
+};
+
+/** Date Decorator
+ *
+ * Appends the date and time to the log message
+ */
+class DateDecorator : public IDecorator
+{
+public:
+ // Inherited methods overridden:
+ void decorate(LogMsg &log_msg) override
+ {
+ log_msg.raw_ += angle_wrap_value(get_time());
+ }
+
+private:
+ /** Gets current system local time
+ *
+ * @return Local time
+ */
+ std::string get_time()
+ {
+ auto now = std::chrono::system_clock::now();
+ auto time = std::chrono::system_clock::to_time_t(now);
+
+ char buf[100] = { 0 };
+ std::strftime(buf, sizeof(buf), "%d-%m-%Y %I:%M:%S", std::localtime(&time));
+ return buf;
+ }
+};
+
+/** Thread ID Decorator
+ *
+ * Appends the thread ID to the log message
+ */
+class ThreadIdDecorator : public IDecorator
+{
+public:
+ // Inherited methods overridden:
+ void decorate(LogMsg &log_msg) override
+ {
+#ifndef NO_MULTI_THREADING
+ log_msg.raw_ += angle_wrap_value(std::this_thread::get_id());
+#endif /* NO_MULTI_THREADING */
+ }
+};
+
+/** Log Level Decorator
+ *
+ * Appends the logging level to the log message
+ */
+class LogLevelDecorator : public IDecorator
+{
+public:
+ // Inherited methods overridden:
+ void decorate(LogMsg &log_msg) override
+ {
+ log_msg.raw_ += angle_wrap_value(string_from_log_level(log_msg.log_level_));
+ }
+};
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_LOG_MSG_DECORATORS_H__ */
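An illustration of how the decorators compose on a LogMsg; the exact strings assume string_from_log_level(LogLevel::INFO) yields "INFO":

    void decorate_example()
    {
        using namespace arm_compute::logging;
        LogMsg msg("kernel configured", LogLevel::INFO);
        LogLevelDecorator level_decorator;
        level_decorator.decorate(msg); // msg.raw_ is now "kernel configured[INFO]"
        ThreadIdDecorator tid_decorator;
        tid_decorator.decorate(msg);   // appends "[<thread id>]" unless NO_MULTI_THREADING is set
    }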
diff --git a/arm_compute/core/utils/logging/Logger.h b/arm_compute/core/utils/logging/Logger.h
new file mode 100644
index 000000000..eb9bdd2e3
--- /dev/null
+++ b/arm_compute/core/utils/logging/Logger.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_LOGGER_H__
+#define __ARM_COMPUTE_LOGGING_LOGGER_H__
+
+#include "arm_compute/core/utils/logging/Helpers.h"
+#include "arm_compute/core/utils/logging/IPrinter.h"
+#include "arm_compute/core/utils/logging/LogMsgDecorators.h"
+#include "arm_compute/core/utils/logging/Types.h"
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace logging
+{
+/** Logger class */
+class Logger
+{
+public:
+ /** Default Constructor
+ *
+ * @param[in] name Name of the logger
+ * @param[in] log_level Logger log level
+ * @param[in] printer Printer to push the messages
+ */
+ Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer);
+ /** Default Constructor
+ *
+ * @param[in] name Name of the logger
+ * @param[in] log_level Logger log level
+ * @param[in] printers Printers to push the messages
+ */
+ Logger(std::string name, LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers = {});
+ /** Default Constructor
+ *
+ * @param[in] name Name of the logger
+ * @param[in] log_level Logger log level
+ * @param[in] printers Printers to push the messages
+ * @param[in] decorators Message decorators, which append information in the logged message
+ */
+ Logger(std::string name,
+ LogLevel log_level,
+ std::vector<std::shared_ptr<Printer>> printers,
+ std::vector<std::unique_ptr<IDecorator>> decorators);
+ /** Allow instances of this class to be moved */
+ Logger(Logger &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ Logger(const Logger &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ Logger &operator=(const Logger &) = delete;
+ /** Allow instances of this class to be moved */
+ Logger &operator=(Logger &&) = default;
+ /** Logs a message
+ *
+ * @param[in] log_level Log level of the message
+ * @param[in] msg Message to log
+ */
+ void log(LogLevel log_level, const std::string &msg);
+ /** Logs a formatted message
+ *
+ * @param[in] log_level Log level of the message
+ * @param[in] fmt Message format
+ * @param[in] args Message arguments
+ */
+ template <typename... Ts>
+ void log(LogLevel log_level, const std::string &fmt, Ts &&... args);
+ /** Sets log level of the logger
+ *
+ * @warning Not thread-safe
+ *
+ * @param[in] log_level Log level to set
+ */
+ void set_log_level(LogLevel log_level);
+ /** Returns logger's log level
+ *
+ * @return Logger's log level
+ */
+ LogLevel log_level() const;
+ /** Returns logger's name
+ *
+ * @return Logger's name
+ */
+ std::string name() const;
+ /** Adds a printer to the logger
+ *
+ * @warning Not thread-safe
+ *
+ * @param[in] printer Printer to add
+ */
+ void add_printer(std::shared_ptr<Printer> printer);
+ /** Adds a log message decorator to the logger
+ *
+ * @warning Not thread-safe
+ *
+ * @param[in] decorator Decorator to add
+ */
+ void add_decorator(std::unique_ptr<IDecorator> decorator);
+
+private:
+ /** Set default message decorators */
+ void set_default_decorators();
+ /** Checks if a message should be logged, depending
+ * on the message log level and the logger's own level
+ *
+ * @param[in] log_level Log level
+ *
+ * @return True if message should be logged else false
+ */
+ bool is_loggable(LogLevel log_level);
+ /** Decorate log message
+ *
+ * @param[in] msg Log message to decorate
+ */
+ void decorate_log_msg(LogMsg &msg);
+ /** Creates the final log message by prepending the decorated prefix
+ *
+ * @param[in] str Log message
+ * @param[in] log_level Message's log level
+ *
+ * @return Final log message to print
+ */
+ std::string create_log_msg(const std::string &str, LogLevel log_level);
+ /** Prints the message to all the printers
+ *
+ * @param[in] msg Message to print
+ */
+ void print_all(const std::string &msg);
+
+private:
+ std::string _name;
+ LogLevel _log_level;
+ std::vector<std::shared_ptr<Printer>> _printers;
+ std::vector<std::unique_ptr<IDecorator>> _decorators;
+};
+
+template <typename... Ts>
+inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&... args)
+{
+ // Return if message shouldn't be logged
+ // i.e. if log level does not match the logger's
+ if(!is_loggable(log_level))
+ {
+ return;
+ }
+
+ // Print message to all printers
+ print_all(create_log_msg(string_with_format(fmt, args...), log_level));
+}
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_LOGGER_H__ */
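A usage sketch for the Logger (logger name and message invented; StdPrinter is introduced later in this patch):

    void logger_example()
    {
        using namespace arm_compute::logging;
        auto printer = std::make_shared<StdPrinter>();
        Logger logger("EXAMPLE", LogLevel::INFO, printer);
        logger.log(LogLevel::INFO, "configured %d kernels", 4);
        // A VERBOSE message would be rejected by is_loggable() at this log level.
    }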
diff --git a/arm_compute/core/utils/logging/LoggerRegistry.h b/arm_compute/core/utils/logging/LoggerRegistry.h
new file mode 100644
index 000000000..d3c691139
--- /dev/null
+++ b/arm_compute/core/utils/logging/LoggerRegistry.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_LOGGER_REGISTRY_H__
+#define __ARM_COMPUTE_LOGGING_LOGGER_REGISTRY_H__
+
+#include "arm_compute/core/utils/logging/Logger.h"
+#include "arm_compute/core/utils/logging/Printers.h"
+#include "arm_compute/core/utils/logging/Types.h"
+#include "support/Mutex.h"
+
+#include <memory>
+#include <set>
+#include <unordered_map>
+
+namespace arm_compute
+{
+namespace logging
+{
+/** Registry class holding all the instantiated loggers */
+class LoggerRegistry final
+{
+public:
+ /** Gets registry instance
+ *
+ * @return Logger registry instance
+ */
+ static LoggerRegistry &get();
+ /** Creates a logger
+ *
+ * @note Some names are reserved e.g. [CORE, RUNTIME, GRAPH]
+ *
+ * @param[in] name Logger's name
+ * @param[in] log_level (Optional) Logger's log level. Defaults to @ref LogLevel::INFO
+ * @param[in] printers (Optional) Printers to attach to the system loggers. Defaults to a @ref StdPrinter.
+ */
+ void create_logger(const std::string &name, LogLevel log_level = LogLevel::INFO,
+ std::vector<std::shared_ptr<Printer>> printers = { std::make_shared<StdPrinter>() });
+ /** Remove a logger
+ *
+ * @param[in] name Logger's name
+ */
+ void remove_logger(const std::string &name);
+ /** Returns a logger instance
+ *
+ * @param[in] name Logger to return
+ *
+ * @return Logger
+ */
+ std::shared_ptr<Logger> logger(const std::string &name);
+ /** Creates reserved library loggers
+ *
+ * @param[in] log_level (Optional) Logger's log level. Defaults to @ref LogLevel::INFO
+ * @param[in] printers (Optional) Printers to attach to the system loggers. Defaults to a @ref StdPrinter.
+ */
+ void create_reserved_loggers(LogLevel log_level = LogLevel::INFO,
+ std::vector<std::shared_ptr<Printer>> printers = { std::make_shared<StdPrinter>() });
+
+private:
+ /** Default constructor */
+ LoggerRegistry();
+
+private:
+ arm_compute::Mutex _mtx;
+ std::unordered_map<std::string, std::shared_ptr<Logger>> _loggers;
+ static std::set<std::string> _reserved_loggers;
+};
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_LOGGER_REGISTRY_H__ */
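A sketch of going through the registry instead of owning a Logger directly; "CORE" is one of the reserved names listed above:

    void registry_example()
    {
        using namespace arm_compute::logging;
        LoggerRegistry::get().create_reserved_loggers(LogLevel::INFO);
        std::shared_ptr<Logger> core = LoggerRegistry::get().logger("CORE");
        if(core != nullptr)
        {
            core->log(LogLevel::INFO, "core logger ready");
        }
    }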
diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h
new file mode 100644
index 000000000..bc121e25e
--- /dev/null
+++ b/arm_compute/core/utils/logging/Macros.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_MACROS_H__
+#define __ARM_COMPUTE_LOGGING_MACROS_H__
+
+#include "arm_compute/core/utils/logging/LoggerRegistry.h"
+
+#include <sstream>
+
+#ifdef ARM_COMPUTE_LOGGING_ENABLED
+
+#define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg) \
+ do \
+ { \
+ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
+ if(__logger != nullptr) \
+ { \
+ __logger->log(log_level, msg); \
+ } \
+ } while(false)
+
+#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...) \
+ do \
+ { \
+ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
+ if(__logger != nullptr) \
+ { \
+ __logger->log(log_level, fmt, __VA_ARGS__); \
+ } \
+ } while(false)
+
+#define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream) \
+ do \
+ { \
+ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
+ if(__logger != nullptr) \
+ { \
+ __logger->log(log_level, static_cast<std::ostringstream &>(std::ostringstream() << stream).str()); \
+ } \
+ } while(false)
+
+#else /* ARM_COMPUTE_LOGGING_ENABLED */
+
+#define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg)
+#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...)
+#define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream)
+
+#endif /* ARM_COMPUTE_LOGGING_ENABLED */
+
+#endif /* __ARM_COMPUTE_LOGGING_MACROS_H__ */
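Illustrative call sites for the three macro flavours, assuming a "CORE" logger has been registered (message contents invented):

    ARM_COMPUTE_LOG_MSG("CORE", arm_compute::logging::LogLevel::INFO, "starting");
    ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", arm_compute::logging::LogLevel::INFO, "%d inputs", 2);
    ARM_COMPUTE_LOG_STREAM("CORE", arm_compute::logging::LogLevel::INFO, "shape " << 224 << "x" << 224);
    // Without ARM_COMPUTE_LOGGING_ENABLED all three expand to nothing, so call sites cost nothing at runtime.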
diff --git a/arm_compute/core/utils/logging/Printers.h b/arm_compute/core/utils/logging/Printers.h
new file mode 100644
index 000000000..7e5eef6a0
--- /dev/null
+++ b/arm_compute/core/utils/logging/Printers.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_PRINTERS_H__
+#define __ARM_COMPUTE_LOGGING_PRINTERS_H__
+
+#include "arm_compute/core/utils/logging/FilePrinter.h"
+#include "arm_compute/core/utils/logging/IPrinter.h"
+#include "arm_compute/core/utils/logging/StdPrinter.h"
+
+#endif /* __ARM_COMPUTE_LOGGING_PRINTERS_H__ */
diff --git a/arm_compute/core/utils/logging/StdPrinter.h b/arm_compute/core/utils/logging/StdPrinter.h
new file mode 100644
index 000000000..0b41b2602
--- /dev/null
+++ b/arm_compute/core/utils/logging/StdPrinter.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_STD_PRINTER_H__
+#define __ARM_COMPUTE_LOGGING_STD_PRINTER_H__
+
+#include "arm_compute/core/utils/logging/IPrinter.h"
+
+#include <iostream>
+
+namespace arm_compute
+{
+namespace logging
+{
+/** Std Printer */
+class StdPrinter final : public Printer
+{
+private:
+ // Inherited methods overridden:
+ void print_internal(const std::string &msg) override
+ {
+ std::cout << msg << std::endl;
+ }
+};
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_STD_PRINTER_H__ */
diff --git a/arm_compute/core/utils/logging/Types.h b/arm_compute/core/utils/logging/Types.h
new file mode 100644
index 000000000..171270d4e
--- /dev/null
+++ b/arm_compute/core/utils/logging/Types.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LOGGING_TYPES_H__
+#define __ARM_COMPUTE_LOGGING_TYPES_H__
+
+#include <string>
+
+namespace arm_compute
+{
+namespace logging
+{
+/** Logging level enumeration */
+enum class LogLevel : unsigned int
+{
+ VERBOSE, /**< All logging messages */
+ INFO, /**< Information log level */
+ WARN, /**< Warning log level */
+ OFF /**< No logging */
+};
+
+/** Log message */
+struct LogMsg
+{
+ LogMsg()
+ : raw_(), log_level_(LogLevel::OFF)
+ {
+ }
+ LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF)
+ : raw_(msg), log_level_(log_level)
+ {
+ }
+
+ std::string raw_; /**< Raw log message */
+ LogLevel log_level_; /**< Message log level */
+};
+} // namespace logging
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_LOGGING_TYPES_H__ */
diff --git a/arm_compute/core/utils/misc/ICloneable.h b/arm_compute/core/utils/misc/ICloneable.h
new file mode 100644
index 000000000..5852f14f7
--- /dev/null
+++ b/arm_compute/core/utils/misc/ICloneable.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MISC_ICLONEABLE_H__
+#define __ARM_COMPUTE_MISC_ICLONEABLE_H__
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace misc
+{
+/** Cloneable interface */
+template <class T>
+class ICloneable
+{
+public:
+ /** Default virtual destructor */
+ virtual ~ICloneable() = default;
+ /** Provide a clone of the current object of class T
+ *
+ * @return Clone object of class T
+ */
+ virtual std::unique_ptr<T> clone() const = 0;
+};
+} // namespace misc
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_MISC_ICLONEABLE_H__ */
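A sketch of a class implementing the interface; KernelInfo is hypothetical:

    class KernelInfo : public arm_compute::misc::ICloneable<KernelInfo>
    {
    public:
        std::unique_ptr<KernelInfo> clone() const override
        {
            // Plain copy; written C++11-style to avoid std::make_unique.
            return std::unique_ptr<KernelInfo>(new KernelInfo(*this));
        }
    };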
diff --git a/arm_compute/core/utils/misc/utility.h b/arm_compute/core/utils/misc/utility.h
new file mode 100644
index 000000000..898d0cdea
--- /dev/null
+++ b/arm_compute/core/utils/misc/utility.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MISC_UTILITY_H__
+#define __ARM_COMPUTE_MISC_UTILITY_H__
+
+#include <array>
+#include <iterator>
+
+namespace arm_compute
+{
+namespace utility
+{
+/** @cond */
+template <std::size_t...>
+struct index_sequence
+{
+};
+
+template <std::size_t N, std::size_t... S>
+struct index_sequence_generator : index_sequence_generator < N - 1, N - 1, S... >
+{
+};
+
+template <std::size_t... S>
+struct index_sequence_generator<0u, S...> : index_sequence<S...>
+{
+ using type = index_sequence<S...>;
+};
+
+template <std::size_t N>
+using index_sequence_t = typename index_sequence_generator<N>::type;
+/** @endcond */
+
+namespace detail
+{
+template <std::size_t... S,
+ typename Iterator,
+ typename T = std::array<typename std::iterator_traits<Iterator>::value_type, sizeof...(S)>>
+T make_array(Iterator first, index_sequence<S...>)
+{
+ return T{ { first[S]... } };
+}
+} // namespace detail
+
+template <std::size_t N, typename Iterator>
+std::array<typename std::iterator_traits<Iterator>::value_type, N> make_array(Iterator first, Iterator last)
+{
+ return detail::make_array(first, index_sequence_t<N> {});
+}
+} // namespace utility
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_MISC_UTILITY_H__ */
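For illustration, make_array copies the first N elements of a range into a std::array; note that the 'last' iterator is accepted but not consulted:

    #include <iterator>

    void array_example()
    {
        int data[] = { 1, 2, 3, 4, 5 };
        auto arr = arm_compute::utility::make_array<3>(std::begin(data), std::end(data));
        // arr == std::array<int, 3>{ { 1, 2, 3 } }
    }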
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
new file mode 100644
index 000000000..6fd1d8001
--- /dev/null
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_QUANTIZATION_ASYMM_HELPERS_H__
+#define __ARM_COMPUTE_QUANTIZATION_ASYMM_HELPERS_H__
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+namespace quantization
+{
+/** Calculate the quantized representation of a multiplier with a value less than one.
+ *
+ * @param[in] multiplier Real multiplier.
+ * @param[out] quant_multiplier Integer multiplier.
+ * @param[out] right_shift Right bit shift.
+ *
+ * @return a status
+ */
+arm_compute::Status calculate_quantized_multiplier_less_than_one(double multiplier, int *quant_multiplier, int *right_shift);
+/** Calculate the quantized representation of a multiplier with a value greater than one.
+ *
+ * @param[in] multiplier Real multiplier.
+ * @param[out] quantized_multiplier Integer multiplier.
+ * @param[out] left_shift Left bit shift.
+ *
+ * @return a status
+ */
+arm_compute::Status calculate_quantized_multiplier_greater_than_one(double multiplier, int *quantized_multiplier, int *left_shift);
+} // namespace quantization
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_QUANTIZATION_ASYMM_HELPERS_H__ */
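These helpers decompose a real-valued multiplier M into an integer mantissa and a power-of-two shift, in the GEMMLowp style: M ≈ quant_multiplier * 2^(-right_shift) (or * 2^(left_shift) for the greater-than-one variant). A hedged usage sketch:

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    void quantize_multiplier_example()
    {
        int quant_multiplier = 0;
        int right_shift      = 0;
        // Decompose 0.75 as quant_multiplier * 2^-right_shift.
        arm_compute::Status status =
            arm_compute::quantization::calculate_quantized_multiplier_less_than_one(0.75, &quant_multiplier, &right_shift);
        // A non-OK status signals an out-of-range multiplier.
        (void)status;
    }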
diff --git a/arm_compute/graph/CL/CLMap.h b/arm_compute/graph/CL/CLMap.h
index a205ebcad..732a1df77 100644
--- a/arm_compute/graph/CL/CLMap.h
+++ b/arm_compute/graph/CL/CLMap.h
@@ -29,11 +29,11 @@
namespace arm_compute
{
-class CLTensor;
+class ICLTensor;
namespace graph
{
-class Tensor;
+class ITensorObject;
/** OpenCL map function */
class CLMap : public arm_compute::IFunction
{
@@ -43,7 +43,7 @@ public:
* @param[in] tensor Tensor to map
* @param[in] blocking Flag to specify if the map should be blocking or not (defaults to false)
*/
- CLMap(Tensor *tensor, bool blocking = false);
+ CLMap(ITensorObject *tensor, bool blocking = false);
/** Prevent instances from being copy constructed */
CLMap(const CLMap &) = delete;
/** Prevent instances from being copy assigned */
@@ -57,8 +57,8 @@ public:
void run() override;
private:
- arm_compute::CLTensor *_tensor; /**< Tensor */
- bool _blocking; /**< Blocking flag */
+ arm_compute::ICLTensor *_tensor; /**< Tensor */
+ bool _blocking; /**< Blocking flag */
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/CL/CLUnmap.h b/arm_compute/graph/CL/CLUnmap.h
index a72706353..17745c436 100644
--- a/arm_compute/graph/CL/CLUnmap.h
+++ b/arm_compute/graph/CL/CLUnmap.h
@@ -29,11 +29,11 @@
namespace arm_compute
{
-class CLTensor;
+class ICLTensor;
namespace graph
{
-class Tensor;
+class ITensorObject;
/** OpenCL un-map function */
class CLUnmap : public arm_compute::IFunction
{
@@ -42,7 +42,7 @@ public:
*
* @param[in] tensor Tensor to un-map
*/
- CLUnmap(Tensor *tensor);
+ CLUnmap(ITensorObject *tensor);
/** Prevent instances from being copy constructed */
CLUnmap(const CLUnmap &) = delete;
/** Prevent instances from being copy assigned */
@@ -56,7 +56,7 @@ public:
void run() override;
private:
- arm_compute::CLTensor *_tensor; /**< Tensor */
+ arm_compute::ICLTensor *_tensor; /**< Tensor */
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/Error.h b/arm_compute/graph/Error.h
new file mode 100644
index 000000000..0c8ed266c
--- /dev/null
+++ b/arm_compute/graph/Error.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_ERROR_H__
+#define __ARM_COMPUTE_GRAPH_ERROR_H__
+
+#include "arm_compute/graph/ITensorObject.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Evaluates if a tensor object, or the tensor it wraps, is null. If so, an error message is printed and an exception is thrown
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_object Tensor object to evaluate
+ * @param[in] tensor_objects (Optional) Further tensor objects to evaluate.
+ */
+template <typename... Ts>
+void error_on_unallocated_tensor_object(const char *function, const char *file, int line,
+ const ITensorObject *tensor_object, Ts... tensor_objects)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(tensor_object);
+
+ ARM_COMPUTE_ERROR_ON_LOC(tensor_object == nullptr || tensor_object->tensor() == nullptr, function, file, line);
+
+ const std::array<const ITensorObject *, sizeof...(Ts)> tensor_objects_array{ { std::forward<Ts>(tensor_objects)... } };
+ ARM_COMPUTE_UNUSED(tensor_objects_array);
+
+ ARM_COMPUTE_ERROR_ON_LOC(std::any_of(tensor_objects_array.begin(), tensor_objects_array.end(), [&](const ITensorObject * tensor_obj)
+ {
+ return (tensor_obj == nullptr || tensor_obj->tensor() == nullptr);
+ }),
+ function, file, line);
+}
+#define ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(...) ::arm_compute::graph::error_on_unallocated_tensor_object(__func__, __FILE__, __LINE__, __VA_ARGS__)
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_ERROR_H__ */
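Intended use of the new macro, in the spirit of the core ARM_COMPUTE_ERROR_ON_* family (an illustrative sketch):

    void check_io(arm_compute::graph::ITensorObject *input, arm_compute::graph::ITensorObject *output)
    {
        // Fails with function/file/line information if any argument is null
        // or wraps a null backend tensor.
        ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
    }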
diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h
index 9d06f44be..ab1d8b886 100644
--- a/arm_compute/graph/Graph.h
+++ b/arm_compute/graph/Graph.h
@@ -25,6 +25,8 @@
#define __ARM_COMPUTE_GRAPH_GRAPH_H__
#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/SubTensor.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/Types.h"
#include "support/ToolchainSupport.h"
@@ -64,7 +66,10 @@ public:
*
* @param[in] tensor Tensor to add
*/
- void add_tensor(std::unique_ptr<Tensor> tensor);
+ void add_tensor_object(std::unique_ptr<ITensorObject> tensor);
+ /** Checks if the OpenCL runtime is available
+ */
+ static bool opencl_is_available();
/** Manually sets the output of the current node
*
* @param[in] tmp Output info to set
@@ -98,6 +103,14 @@ Graph &operator<<(Graph &graph, TensorInfo &&info);
* @return Updated graph
*/
Graph &operator<<(Graph &graph, Tensor &&tensor);
+/** Overloaded stream operator to add a sub-tensor to the graph
+ *
+ * @param[in, out] graph Graph to add the tensor
+ * @param[in] sub_tensor Sub-tensor to be added
+ *
+ * @return Updated graph
+ */
+Graph &operator<<(Graph &graph, SubTensor &&sub_tensor);
/** Overloaded stream operator to provide a target hint to the graph
*
* @param[in, out] graph Graph to provide the hint to
diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h
index 1b22bdf63..56b50b942 100644
--- a/arm_compute/graph/INode.h
+++ b/arm_compute/graph/INode.h
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_GRAPH_INODE_H__
#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
#include "arm_compute/runtime/IFunction.h"
@@ -46,7 +47,7 @@ public:
* @param[in] input Input tensor of the node
* @param[in] output Output tensor of the node
*/
- virtual std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) = 0;
+ virtual std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) = 0;
/** Override the existing target hint
*
* @note If the input is DONT_CARE then the method has to pick a technology,
diff --git a/arm_compute/graph/IOperation.h b/arm_compute/graph/IOperation.h
new file mode 100644
index 000000000..a9fa4f83c
--- /dev/null
+++ b/arm_compute/graph/IOperation.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_IOPERATION_H__
+#define __ARM_COMPUTE_GRAPH_IOPERATION_H__
+
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Operation functor interface */
+class IOperation
+{
+public:
+ /** Virtual Destructor */
+ virtual ~IOperation() = default;
+ /** Interface to be implemented that configures an operation
+ *
+ * @param[in] ctx Node parameters to be used by the operation
+ */
+ virtual std::unique_ptr<arm_compute::IFunction> configure(NodeContext &ctx) = 0;
+ /** Interface to be implemented that returns the target of the operation
+ *
+ * @return Target of the operation
+ */
+ virtual TargetHint target() const = 0;
+};
+
+#define REGISTER_SIMPLE_OPERATION(NAME, TARGET, OP) \
+ class NAME : public IOperation \
+ { \
+ public: \
+ std::unique_ptr<arm_compute::IFunction> configure(NodeContext &ctx) final; \
+ TargetHint target() const final \
+ { \
+ return TargetHint::TARGET; \
+ } \
+ }; \
+ static detail::OperationRegistrar<NAME> NAME##_registrar(OP); \
+ std::unique_ptr<arm_compute::IFunction> NAME::configure(NodeContext &ctx)
+
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_IOPERATION_H__ */
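The REGISTER_SIMPLE_OPERATION macro expands to an IOperation subclass, a static registrar that records it in the OperationRegistry under OP, and the opening of its configure() definition, so the macro invocation is followed directly by the function body. A hedged sketch registering a NEON floor operation (NEFloorOperation is an illustrative name; NEFloor is the existing runtime function):

    #include "arm_compute/graph/IOperation.h"
    #include "arm_compute/graph/OperationRegistrar.h"
    #include "arm_compute/runtime/NEON/functions/NEFloor.h"
    #include "support/ToolchainSupport.h"

    // Assumed to live inside namespace arm_compute { namespace graph { ... } }
    REGISTER_SIMPLE_OPERATION(NEFloorOperation, NEON, OperationType::FloorLayer)
    {
        // This block becomes the body of NEFloorOperation::configure(NodeContext &ctx)
        auto *in   = ctx.input(0);
        auto *out  = ctx.output(0);
        auto  func = arm_compute::support::cpp14::make_unique<arm_compute::NEFloor>();
        func->configure(in, out);
        return std::move(func);
    }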
diff --git a/arm_compute/graph/ITensorObject.h b/arm_compute/graph/ITensorObject.h
new file mode 100644
index 000000000..a922dd53f
--- /dev/null
+++ b/arm_compute/graph/ITensorObject.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_ITENSOROBJECT_H__
+#define __ARM_COMPUTE_GRAPH_ITENSOROBJECT_H__
+
+#include "arm_compute/graph/ITensorAccessor.h"
+#include "arm_compute/graph/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Tensor object interface */
+class ITensorObject
+{
+public:
+ /** Default Destructor */
+ virtual ~ITensorObject() = default;
+ /** Calls accessor on tensor
+ *
+ * @return True if the call succeeds, else false
+ */
+ virtual bool call_accessor() = 0;
+ /** Checks if tensor has an accessor set.
+ *
+ * @return True if an accessor has been set else false
+ */
+ virtual bool has_accessor() const = 0;
+ /** Sets target of the tensor
+ *
+ * @param[in] target Target where the tensor should be pinned
+ *
+ * @return Backend tensor
+ */
+ virtual ITensor *set_target(TargetHint target) = 0;
+ /** Returns a pointer to the internal tensor
+ *
+ * @return Tensor
+ */
+ virtual ITensor *tensor() = 0;
+ virtual const ITensor *tensor() const = 0;
+ /** Return the target that this tensor is pinned on
+ *
+ * @return Target of the tensor
+ */
+ virtual TargetHint target() const = 0;
+ /** Allocates the tensor */
+ virtual void allocate() = 0;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_ITENSOROBJECT_H__ */
diff --git a/arm_compute/graph/NodeContext.h b/arm_compute/graph/NodeContext.h
new file mode 100644
index 000000000..bc90f217a
--- /dev/null
+++ b/arm_compute/graph/NodeContext.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_NODE_CONTEXT_H__
+#define __ARM_COMPUTE_GRAPH_NODE_CONTEXT_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/NodeParameter.h"
+#include "arm_compute/graph/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Node Context class
+ *
+ * Node context class is used to hold all the parameters required by a node to execute
+ */
+class NodeContext
+{
+public:
+ /** Default Constructor
+ *
+ * @param[in] operation Name of the operation
+ */
+ NodeContext(OperationType operation)
+ : _operation(operation), _target(TargetHint::DONT_CARE), _inputs(), _outputs(), _parameters() {};
+ /** Sets the execution target of the node
+ *
+ * @param[in] target Execution target of the node
+ */
+ void set_target(TargetHint target);
+ /** Adds an input tensor to the context
+ *
+ * @param[in] input Input to add
+ */
+ void add_input(arm_compute::ITensor *input);
+ /** Adds an output to the context
+ *
+ * @param[in] output Output to add
+ */
+ void add_output(arm_compute::ITensor *output);
+ /** Adds a parameter to the context
+ *
+ * @param[in] name Parameter name
+ * @param[in] parameter Parameter to add
+ */
+ template <typename T>
+ void add_parameter(std::string name, T parameter);
+ /** Returns the operation of this node.
+ *
+ * @return The operation type
+ */
+ OperationType operation() const;
+ /** Returns the execution target of this node
+ *
+ * @return The execution target
+ */
+ TargetHint target() const;
+ /** Returns input tensor of a given index
+ *
+ * @param[in] idx Index of the input tensor
+ *
+ * @return A pointer to the requested input tensor, else nullptr
+ */
+ arm_compute::ITensor *input(size_t idx) const;
+ /** Returns output tensor of a given index
+ *
+ * @param[in] idx Index of the output tensor
+ *
+ * @return A pointer to the requested output tensor, else nullptr
+ */
+ arm_compute::ITensor *output(size_t idx) const;
+ /** Returns the parameter with the given name
+ *
+ * @param[in] name Parameter name
+ *
+ * @return The requested parameter else an empty object
+ */
+ template <typename T>
+ T parameter(std::string name) const;
+ /** Returns number of inputs
+ *
+ * @return Number of inputs
+ */
+ size_t num_inputs() const;
+ /** Returns number of outputs
+ *
+ * @return Number of outputs
+ */
+ size_t num_outputs() const;
+
+private:
+ OperationType _operation;
+ TargetHint _target;
+ std::vector<arm_compute::ITensor *> _inputs;
+ std::vector<arm_compute::ITensor *> _outputs;
+ std::map<std::string, std::unique_ptr<NodeParameterBase>> _parameters;
+};
+
+template <typename T>
+inline void NodeContext::add_parameter(std::string name, T parameter)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_parameters.find(name) != _parameters.end(), "Parameter already exists!");
+ _parameters[name] = support::cpp14::make_unique<NodeParameter<T>>(name, parameter);
+}
+
+template <typename T>
+inline T NodeContext::parameter(std::string name) const
+{
+ auto it = _parameters.find(name);
+ ARM_COMPUTE_ERROR_ON(it == _parameters.end());
+ return static_cast<NodeParameter<T> *>(it->second.get())->value();
+}
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_NODE_CONTEXT_H__ */
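Sketch of the parameter round-trip: values are stored type-erased behind NodeParameterBase and recovered via a static_cast in parameter<T>(), so the retrieving type must match the stored type exactly; there is no runtime type check. The parameter name and value below are illustrative:

    #include "arm_compute/graph/NodeContext.h"

    void node_context_example()
    {
        arm_compute::graph::NodeContext ctx(arm_compute::graph::OperationType::ActivationLayer);
        ctx.set_target(arm_compute::graph::TargetHint::NEON);
        ctx.add_parameter<float>("beta", 1.f);
        // Read back with the exact stored type; a mismatched T is undefined behaviour.
        float beta = ctx.parameter<float>("beta");
        (void)beta;
    }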
diff --git a/arm_compute/graph/NodeParameter.h b/arm_compute/graph/NodeParameter.h
new file mode 100644
index 000000000..9d3823d54
--- /dev/null
+++ b/arm_compute/graph/NodeParameter.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_NODE_PARAMETER_H__
+#define __ARM_COMPUTE_GRAPH_NODE_PARAMETER_H__
+
+#include <ostream>
+#include <string>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Empty base class for node parameters */
+class NodeParameterBase
+{
+};
+
+/** Template parameter implementation */
+template <typename T>
+class NodeParameter : public NodeParameterBase
+{
+public:
+ /** Default Constructor
+ *
+ * @param[in] name Parameter name
+ * @param[in] val Parameter value
+ */
+ NodeParameter(std::string name, T val)
+ : _name(name), _val(val) {};
+ /** Returns parameter's name
+ *
+ * @return the name of the parameter
+ */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Returns parameter's value
+ *
+ * @return the value of the parameter
+ */
+ T value()
+ {
+ return _val;
+ }
+
+private:
+ std::string _name;
+ T _val;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_NODE_PARAMETER_H__ */
diff --git a/arm_compute/graph/Nodes.h b/arm_compute/graph/Nodes.h
index 548deabeb..0282e1d2a 100644
--- a/arm_compute/graph/Nodes.h
+++ b/arm_compute/graph/Nodes.h
@@ -26,12 +26,19 @@
#include "arm_compute/graph/nodes/ActivationLayer.h"
#include "arm_compute/graph/nodes/BatchNormalizationLayer.h"
+#include "arm_compute/graph/nodes/BranchLayer.h"
#include "arm_compute/graph/nodes/ConvolutionLayer.h"
+#include "arm_compute/graph/nodes/DepthConvertLayer.h"
+#include "arm_compute/graph/nodes/DepthwiseConvolutionLayer.h"
+#include "arm_compute/graph/nodes/DequantizationLayer.h"
+#include "arm_compute/graph/nodes/FlattenLayer.h"
#include "arm_compute/graph/nodes/FloorLayer.h"
#include "arm_compute/graph/nodes/FullyConnectedLayer.h"
#include "arm_compute/graph/nodes/L2NormalizeLayer.h"
#include "arm_compute/graph/nodes/NormalizationLayer.h"
#include "arm_compute/graph/nodes/PoolingLayer.h"
+#include "arm_compute/graph/nodes/QuantizationLayer.h"
+#include "arm_compute/graph/nodes/ReshapeLayer.h"
#include "arm_compute/graph/nodes/SoftmaxLayer.h"
#endif /* __ARM_COMPUTE_GRAPH_NODES_H__ */
diff --git a/arm_compute/graph/OperationRegistrar.h b/arm_compute/graph/OperationRegistrar.h
new file mode 100644
index 000000000..ee171c351
--- /dev/null
+++ b/arm_compute/graph/OperationRegistrar.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_GRAPH_OPERATION_REGISTRAR
+#define ARM_COMPUTE_GRAPH_OPERATION_REGISTRAR
+
+#include "arm_compute/graph/OperationRegistry.h"
+#include "arm_compute/graph/Types.h"
+
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace detail
+{
+/** Helper class to statically register an operation */
+template <typename T>
+class OperationRegistrar final
+{
+public:
+ /** Registers an operation of type T under the given operation type.
+ *
+ * @param[in] operation Operation type
+ */
+ OperationRegistrar(OperationType operation);
+};
+
+template <typename T>
+inline OperationRegistrar<T>::OperationRegistrar(OperationType operation)
+{
+ OperationRegistry::get().add_operation<T>(std::move(operation));
+}
+} // namespace detail
+} // namespace graph
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_GRAPH_OPERATION_REGISTRAR */
\ No newline at end of file
diff --git a/arm_compute/graph/OperationRegistry.h b/arm_compute/graph/OperationRegistry.h
new file mode 100644
index 000000000..ae68bf45a
--- /dev/null
+++ b/arm_compute/graph/OperationRegistry.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_OPERATION_REGISTRY_H__
+#define __ARM_COMPUTE_GRAPH_OPERATION_REGISTRY_H__
+
+#include "arm_compute/graph/IOperation.h"
+#include "arm_compute/graph/Types.h"
+#include "support/ToolchainSupport.h"
+
+#include <map>
+#include <memory>
+#include <string>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Registry holding all the supported operations */
+class OperationRegistry
+{
+public:
+ /** Gets operation registry instance
+ *
+ * @return Operation registry instance
+ */
+ static OperationRegistry &get();
+ /** Finds an operation in the registry
+ *
+ * @param[in] operation Type of the operation to find
+ * @param[in] target Target of the operation
+ *
+ * @return Pointer to the operation functor if found, else nullptr
+ */
+ IOperation *find_operation(OperationType operation, TargetHint target);
+ /** Checks if an operation for a given target exists
+ *
+ * @param[in] operation Operation type
+ * @param[in] target Execution target
+ *
+ * @return True if exists else false
+ */
+ bool contains(OperationType operation, TargetHint target) const;
+ /** Registers an operation to the registry
+ *
+ * @param[in] operation Operation to register
+ */
+ template <typename T>
+ void add_operation(OperationType operation);
+
+private:
+ /** Default Constructor */
+ OperationRegistry();
+
+private:
+ std::map<OperationType, std::vector<std::unique_ptr<IOperation>>> _registered_ops;
+};
+
+template <typename T>
+inline void OperationRegistry::add_operation(OperationType operation)
+{
+ _registered_ops[operation].emplace_back(support::cpp14::make_unique<T>());
+}
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_OPERATION_REGISTRY_H__ */
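Lookup side of the registry, as a graph backend would presumably use it (a sketch under the declared signatures, not code from this patch):

    #include "arm_compute/graph/OperationRegistry.h"

    std::unique_ptr<arm_compute::IFunction> build(arm_compute::graph::NodeContext &ctx)
    {
        using namespace arm_compute::graph;
        // Pick the functor registered for this operation/target pair, if any.
        IOperation *op = OperationRegistry::get().find_operation(ctx.operation(), ctx.target());
        return (op != nullptr) ? op->configure(ctx) : nullptr;
    }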
diff --git a/arm_compute/graph/SubGraph.h b/arm_compute/graph/SubGraph.h
new file mode 100644
index 000000000..d768bf911
--- /dev/null
+++ b/arm_compute/graph/SubGraph.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_SUBGRAPH_H__
+#define __ARM_COMPUTE_GRAPH_SUBGRAPH_H__
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/SubTensor.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** SubGraph class */
+class SubGraph
+{
+public:
+ /** Constructor */
+ SubGraph();
+ /** Adds a node to the graph
+ *
+ * @param[in] node Node to add
+ */
+ void add_node(std::unique_ptr<INode> node);
+ /** Adds a tensor to the graph
+ *
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor_object(std::unique_ptr<ITensorObject> tensor);
+ /** Constructs a graph from a subgraph
+ *
+ * @param[in] hint Execution target hint
+ * @param[in] input Input to the graph
+ * @param[in] output Output to the graph
+ *
+ * @return A graph
+ */
+ std::unique_ptr<Graph> construct(TargetHint hint, std::unique_ptr<ITensorObject> input, std::unique_ptr<ITensorObject> output);
+ /** Checks if the subgraph has an input
+ *
+ * @return True if the sub-graph has an input else false
+ */
+ bool has_input() const;
+ /** Checks if the subgraph has an output
+ *
+ * @return True if the sub-graph has an output else false
+ */
+ bool has_output() const;
+
+private:
+ std::vector<std::unique_ptr<INode>> _nodes;
+ std::unique_ptr<ITensorObject> _input;
+ std::unique_ptr<ITensorObject> _output;
+};
+
+SubGraph &operator<<(SubGraph &graph, Tensor &&tensor);
+SubGraph &operator<<(SubGraph &graph, SubTensor &&sub_tensor);
+
+template <typename Node>
+SubGraph &operator<<(SubGraph &sub_graph, Node node)
+{
+ sub_graph.add_node(arm_compute::support::cpp14::make_unique<Node>(std::move(node)));
+ return sub_graph;
+}
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_SUBGRAPH_H__ */
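Sub-graphs are assembled with the same streaming syntax as a full Graph; a minimal sketch using nodes from this patch:

    arm_compute::graph::SubGraph branch;
    branch << arm_compute::graph::ActivationLayer(
                  arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
           << arm_compute::graph::FloorLayer();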
diff --git a/arm_compute/graph/SubTensor.h b/arm_compute/graph/SubTensor.h
index ace93d20a..72aa78927 100644
--- a/arm_compute/graph/SubTensor.h
+++ b/arm_compute/graph/SubTensor.h
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_GRAPH_SUBTENSOR_H__
#include "arm_compute/graph/ITensorAccessor.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/Types.h"
#include "support/ToolchainSupport.h"
@@ -36,7 +37,7 @@ namespace arm_compute
namespace graph
{
/** SubTensor class */
-class SubTensor final
+class SubTensor final : public ITensorObject
{
public:
/** Default Constructor */
@@ -55,7 +56,7 @@ public:
* @param[in] coords Starting coordinates of the sub-tensor in the parent tensor
* @param[in] target Execution target
*/
- SubTensor(ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target);
+ SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target);
/** Prevent instances of this class from being copied (As this class contains pointers) */
SubTensor(const SubTensor &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -67,37 +68,25 @@ public:
/** Default Destructor */
~SubTensor() = default;
- /** Sets the given TensorInfo to the tensor
- *
- * @param[in] info TensorInfo to set
- */
- void set_info(SubTensorInfo &&info);
- /** Returns tensor's TensorInfo
- *
- * @return TensorInfo of the tensor
- */
- const SubTensorInfo &info() const;
- /** Returns a pointer to the internal tensor
- *
- * @return Tensor
- */
- ITensor *tensor();
- /** Return the target that this tensor is pinned on
- *
- * @return Target of the tensor
- */
- TargetHint target() const;
+ // Inherited methods overriden:
+ bool call_accessor() override;
+ bool has_accessor() const override;
+ arm_compute::ITensor *set_target(TargetHint target) override;
+ arm_compute::ITensor *tensor() override;
+ const arm_compute::ITensor *tensor() const override;
+ TargetHint target() const override;
+ void allocate() override;
private:
/** Instantiates a sub-tensor */
void instantiate_subtensor();
private:
- TargetHint _target; /**< Target that this tensor is pinned on */
- Coordinates _coords; /**< SubTensor Coordinates */
- SubTensorInfo _info; /**< SubTensor metadata */
- ITensor *_parent; /**< Parent tensor */
- std::unique_ptr<ITensor> _subtensor; /**< SubTensor */
+ TargetHint _target; /**< Target that this tensor is pinned on */
+ TensorShape _tensor_shape; /**< SubTensor shape */
+ Coordinates _coords; /**< SubTensor Coordinates */
+ arm_compute::ITensor *_parent; /**< Parent tensor */
+ std::unique_ptr<arm_compute::ITensor> _subtensor; /**< SubTensor */
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/Tensor.h b/arm_compute/graph/Tensor.h
index dbe2ba595..e5821dc81 100644
--- a/arm_compute/graph/Tensor.h
+++ b/arm_compute/graph/Tensor.h
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_GRAPH_TENSOR_H__
#include "arm_compute/graph/ITensorAccessor.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
#include "support/ToolchainSupport.h"
@@ -35,7 +36,7 @@ namespace arm_compute
namespace graph
{
/** Tensor class */
-class Tensor
+class Tensor final : public ITensorObject
{
public:
/** Constructor
@@ -94,43 +95,28 @@ public:
* @param[in] info TensorInfo to set
*/
void set_info(TensorInfo &&info);
- /** Calls accessor on tensor
- *
- * @return True if succeeds else false
- */
- bool call_accessor();
- /** Sets target of the tensor
- *
- * @param[in] target Target where the tensor should be pinned in
- *
- * @return
- */
- ITensor *set_target(TargetHint target);
/** Returns tensor's TensorInfo
*
* @return TensorInfo of the tensor
*/
const TensorInfo &info() const;
- /** Returns a pointer to the internal tensor
- *
- * @return Tensor
- */
- ITensor *tensor();
/** Allocates and fills the tensor if needed */
void allocate_and_fill_if_needed();
- /** Allocates the tensor */
- void allocate();
- /** Return the target that this tensor is pinned on
- *
- * @return Target of the tensor
- */
- TargetHint target() const;
+
+ // Inherited methods overriden:
+ bool call_accessor() override;
+ bool has_accessor() const override;
+ arm_compute::ITensor *set_target(TargetHint target) override;
+ arm_compute::ITensor *tensor() override;
+ const arm_compute::ITensor *tensor() const override;
+ TargetHint target() const override;
+ void allocate() override;
private:
- TargetHint _target; /**< Target that this tensor is pinned on */
- TensorInfo _info; /**< Tensor metadata */
- std::unique_ptr<ITensorAccessor> _accessor; /**< Tensor Accessor */
- std::unique_ptr<ITensor> _tensor; /**< Tensor */
+ TargetHint _target; /**< Target that this tensor is pinned on */
+ TensorInfo _info; /**< Tensor metadata */
+ std::unique_ptr<ITensorAccessor> _accessor; /**< Tensor Accessor */
+ std::unique_ptr<arm_compute::ITensor> _tensor; /**< Tensor */
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index e48ff84ab..f8d20615d 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -27,24 +27,50 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/SubTensorInfo.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/logging/Macros.h"
+
+/** Creates the default graph logger
+ *
+ * @note It will create all default reserved loggers if they don't exist
+ */
+#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \
+ do \
+ { \
+ if(arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \
+ { \
+ arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
+ } \
+ } while(false)
+
+#define ARM_COMPUTE_LOG_GRAPH(log_level, x) \
+ ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER(); \
+ ARM_COMPUTE_LOG_STREAM("GRAPH", log_level, x)
+
+#define ARM_COMPUTE_LOG_GRAPH_INFO(x) \
+ ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER(); \
+ ARM_COMPUTE_LOG_STREAM("GRAPH", arm_compute::logging::LogLevel::INFO, x)
namespace arm_compute
{
namespace graph
{
-using arm_compute::ITensor;
-using arm_compute::TensorInfo;
-using arm_compute::SubTensorInfo;
-using arm_compute::DataType;
-using arm_compute::Coordinates;
-using arm_compute::TensorShape;
-using arm_compute::PadStrideInfo;
-using arm_compute::WeightsInfo;
using arm_compute::ActivationLayerInfo;
+using arm_compute::Coordinates;
+using arm_compute::DataType;
+using arm_compute::DimensionRoundingType;
+using arm_compute::ITensorInfo;
using arm_compute::NormType;
using arm_compute::NormalizationLayerInfo;
+using arm_compute::PadStrideInfo;
using arm_compute::PoolingLayerInfo;
using arm_compute::PoolingType;
+using arm_compute::SubTensorInfo;
+using arm_compute::TensorInfo;
+using arm_compute::TensorShape;
+using arm_compute::WeightsInfo;
+
+using arm_compute::logging::LogLevel;
+using arm_compute::ConvertPolicy;
/**< Execution hint to the graph executor */
enum class TargetHint
@@ -54,12 +80,38 @@ enum class TargetHint
NEON /**< Run node on a NEON capable device */
};
-/**< Convolution method hint to the graph executor */
+/** Convolution method hint to the graph executor */
enum class ConvolutionMethodHint
{
GEMM, /**< Convolution using GEMM */
DIRECT /**< Direct convolution */
};
+
+/** Supported layer operations */
+enum class OperationType
+{
+ ActivationLayer,
+ BatchNormalizationLayer,
+ ConvolutionLayer,
+ DepthConvertLayer,
+ DepthwiseConvolutionLayer,
+ DequantizationLayer,
+ FlattenLayer,
+ FloorLayer,
+ FullyConnectedLayer,
+ L2NormalizeLayer,
+ NormalizationLayer,
+ PoolingLayer,
+ QuantizationLayer,
+ ReshapeLayer,
+ SoftmaxLayer
+};
+
+/** Branch layer merging method */
+enum class BranchMergeMethod
+{
+ DEPTH_CONCATENATE /**< Concatenate across depth */
+};
} // namespace graph
} // namespace arm_compute
#endif /*__ARM_COMPUTE_GRAPH_TYPES_H__*/
diff --git a/arm_compute/graph/nodes/ActivationLayer.h b/arm_compute/graph/nodes/ActivationLayer.h
index efe8112e7..bc619a8df 100644
--- a/arm_compute/graph/nodes/ActivationLayer.h
+++ b/arm_compute/graph/nodes/ActivationLayer.h
@@ -26,7 +26,7 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
namespace arm_compute
@@ -44,7 +44,7 @@ public:
ActivationLayer(const ActivationLayerInfo activation_info);
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
private:
const ActivationLayerInfo _activation_info; /**< Activation layer info */
diff --git a/arm_compute/graph/nodes/BatchNormalizationLayer.h b/arm_compute/graph/nodes/BatchNormalizationLayer.h
index f01cac236..df7b1d19a 100644
--- a/arm_compute/graph/nodes/BatchNormalizationLayer.h
+++ b/arm_compute/graph/nodes/BatchNormalizationLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/Types.h"
@@ -52,7 +53,7 @@ public:
}
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
private:
Tensor _mean;
diff --git a/arm_compute/graph/nodes/BranchLayer.h b/arm_compute/graph/nodes/BranchLayer.h
new file mode 100644
index 000000000..c71899f4f
--- /dev/null
+++ b/arm_compute/graph/nodes/BranchLayer.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_BRANCH_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_BRANCH_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/SubGraph.h"
+#include "arm_compute/graph/SubTensor.h"
+#include "arm_compute/graph/Types.h"
+
+#include "arm_compute/core/Helpers.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Branch Layer node */
+class BranchLayer final : public INode
+{
+public:
+ /** Constructor
+ *
+ * @param[in] merge_method Branch merging method
+ * @param[in] sub_graph1 First graph branch
+ * @param[in] sub_graph2 Second graph branch
+ * @param[in] rest_sub_graphs Remaining sub-graph branches
+ */
+ template <typename... Ts>
+ BranchLayer(BranchMergeMethod merge_method, SubGraph &&sub_graph1, SubGraph &&sub_graph2, Ts &&... rest_sub_graphs)
+ : _branch_merge_method(merge_method), _sub_graphs()
+ {
+ _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph1)));
+ _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph2)));
+
+ for_each([&](SubGraph & sub_graph)
+ {
+ _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph)));
+ },
+ std::move(rest_sub_graphs)...);
+ }
+ /** Constructor
+ *
+ * @param[in] sub_graph Sub graph
+ */
+ template <typename... Ts>
+ BranchLayer(SubGraph &&sub_graph)
+ : _branch_merge_method(BranchMergeMethod::DEPTH_CONCATENATE), _sub_graphs()
+ {
+ _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph)));
+ }
+
+ // Inherited methods overriden:
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+
+private:
+ BranchMergeMethod _branch_merge_method;
+ std::vector<std::unique_ptr<SubGraph>> _sub_graphs;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_BRANCH_LAYER_H__ */
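Combined with SubGraph, this enables Inception-style forks: each branch runs its own sub-graph and the branch outputs are merged across depth. A hedged sketch, assuming the usual node streaming operator on Graph; the surrounding graph construction is elided:

    using namespace arm_compute::graph;

    SubGraph a;
    a << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    SubGraph b;
    b << FloorLayer();

    Graph g;
    // ... input tensor and preceding layers streamed into g beforehand ...
    g << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(a), std::move(b));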
diff --git a/arm_compute/graph/nodes/ConvolutionLayer.h b/arm_compute/graph/nodes/ConvolutionLayer.h
index 04ba3dd6b..0905524de 100644
--- a/arm_compute/graph/nodes/ConvolutionLayer.h
+++ b/arm_compute/graph/nodes/ConvolutionLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/SubTensor.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/Types.h"
@@ -77,7 +78,7 @@ public:
}
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
private:
/** Instantiates a non-grouped convolution
diff --git a/arm_compute/graph/nodes/DepthConvertLayer.h b/arm_compute/graph/nodes/DepthConvertLayer.h
new file mode 100644
index 000000000..03bf9b7ed
--- /dev/null
+++ b/arm_compute/graph/nodes/DepthConvertLayer.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_DEPTHCONVERT_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_DEPTHCONVERT_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/Types.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** DepthConvertLayer layer node */
+class DepthConvertLayer final : public INode
+{
+public:
+ /** Default constructor
+ *
+ * @param[in] policy Conversion policy
+ * @param[in] shift Shift value
+ * @param[in] output_datatype Output datatype
+ */
+ DepthConvertLayer(const ConvertPolicy policy, uint32_t shift, DataType output_datatype);
+
+ // Inherited methods overriden:
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+
+private:
+ const ConvertPolicy _policy;
+ uint32_t _shift;
+ DataType _output_datatype;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_DEPTHCONVERT_LAYER_H__ */
diff --git a/arm_compute/graph/nodes/DepthwiseConvolutionLayer.h b/arm_compute/graph/nodes/DepthwiseConvolutionLayer.h
new file mode 100644
index 000000000..8b7e3b829
--- /dev/null
+++ b/arm_compute/graph/nodes/DepthwiseConvolutionLayer.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_DEPTHWISE_CONVOLUTION_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_DEPTHWISE_CONVOLUTION_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/SubTensor.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Depthwise convolution layer node */
+class DepthwiseConvolutionLayer final : public INode
+{
+public:
+ /** Default constructor
+ *
+ * @param[in] conv_width Convolution width
+ * @param[in] conv_height Convolution height
+ * @param[in] weights Weights values tensor
+ * @param[in] biases Biases values tensor
+ * @param[in] conv_info Convolution info
+ * @param[in] opt3x3 (Optional) If true executes DepthwiseConvolutionLayer3x3
+ */
+ template <typename AccessorType>
+ DepthwiseConvolutionLayer(unsigned int conv_width, unsigned int conv_height, AccessorType &&weights, AccessorType &&biases, const PadStrideInfo conv_info, bool opt3x3 = true)
+ : _conv_width(conv_width), _conv_height(conv_height), _weights(std::move(weights)), _biases(std::move(biases)), _conv_info(conv_info), _opt3x3(opt3x3)
+ {
+ }
+
+ // Inherited methods overriden:
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+
+private:
+ unsigned int _conv_width;
+ unsigned int _conv_height;
+ Tensor _weights;
+ Tensor _biases;
+ const PadStrideInfo _conv_info;
+ bool _opt3x3;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_DEPTHWISE_CONVOLUTION_LAYER_H__ */
diff --git a/arm_compute/graph/nodes/DequantizationLayer.h b/arm_compute/graph/nodes/DequantizationLayer.h
new file mode 100644
index 000000000..f9b7e8af8
--- /dev/null
+++ b/arm_compute/graph/nodes/DequantizationLayer.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_DEQUANTIZATION_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_DEQUANTIZATION_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/Types.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** DequantizationLayer layer node */
+class DequantizationLayer final : public INode
+{
+public:
+ /** Default constructor
+ *
+ * @param[in] min_max Min max value tensor
+ */
+ template <typename AccessorType>
+ DequantizationLayer(AccessorType &&min_max)
+ : _min_max(std::move(min_max))
+ {
+ }
+
+ // Inherited methods overriden:
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+
+private:
+ Tensor _min_max;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_DEQUANTIZATION_LAYER_H__ */
diff --git a/arm_compute/graph/nodes/FlattenLayer.h b/arm_compute/graph/nodes/FlattenLayer.h
new file mode 100644
index 000000000..c5f51a2b3
--- /dev/null
+++ b/arm_compute/graph/nodes/FlattenLayer.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_FLATTEN_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_FLATTEN_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/Types.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Flatten layer node */
+class FlattenLayer final : public INode
+{
+public:
+ // Inherited methods overriden:
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_FLATTEN_LAYER_H__ */
diff --git a/arm_compute/graph/nodes/FloorLayer.h b/arm_compute/graph/nodes/FloorLayer.h
index 40fde3b79..146e2c16d 100644
--- a/arm_compute/graph/nodes/FloorLayer.h
+++ b/arm_compute/graph/nodes/FloorLayer.h
@@ -26,18 +26,18 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
namespace arm_compute
{
namespace graph
{
/** Floor layer node */
-class FloorLayer : public INode
+class FloorLayer final : public INode
{
public:
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
};
} // namespace graph
diff --git a/arm_compute/graph/nodes/FullyConnectedLayer.h b/arm_compute/graph/nodes/FullyConnectedLayer.h
index d31e06045..270676a6b 100644
--- a/arm_compute/graph/nodes/FullyConnectedLayer.h
+++ b/arm_compute/graph/nodes/FullyConnectedLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/Types.h"
@@ -50,7 +51,7 @@ public:
}
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
// Inherited methods overriden:
private:
diff --git a/arm_compute/graph/nodes/L2NormalizeLayer.h b/arm_compute/graph/nodes/L2NormalizeLayer.h
index ab333a221..a423306bd 100644
--- a/arm_compute/graph/nodes/L2NormalizeLayer.h
+++ b/arm_compute/graph/nodes/L2NormalizeLayer.h
@@ -26,14 +26,14 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
namespace arm_compute
{
namespace graph
{
-/** L2Normalize layer node */
+/** L2NormalizeLayer node */
class L2NormalizeLayer final : public INode
{
public:
@@ -42,13 +42,10 @@ public:
* @param[in] axis Dimension along which to reduce.
* @param[in] epsilon Lower bound value for the normalization.
*/
- explicit L2NormalizeLayer(unsigned int axis, float epsilon)
- : _axis(axis), _epsilon(epsilon)
- {
- }
+ explicit L2NormalizeLayer(unsigned int axis, float epsilon);
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
private:
unsigned int _axis;
diff --git a/arm_compute/graph/nodes/NormalizationLayer.h b/arm_compute/graph/nodes/NormalizationLayer.h
index 02efd1cbe..e1c45094d 100644
--- a/arm_compute/graph/nodes/NormalizationLayer.h
+++ b/arm_compute/graph/nodes/NormalizationLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
namespace arm_compute
@@ -43,7 +44,7 @@ public:
explicit NormalizationLayer(const NormalizationLayerInfo norm_info);
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
private:
const NormalizationLayerInfo _norm_info; /**< Normalization layer information */
diff --git a/arm_compute/graph/nodes/PoolingLayer.h b/arm_compute/graph/nodes/PoolingLayer.h
index 87b15d06c..5c45bc04e 100644
--- a/arm_compute/graph/nodes/PoolingLayer.h
+++ b/arm_compute/graph/nodes/PoolingLayer.h
@@ -26,7 +26,7 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
namespace arm_compute
@@ -44,7 +44,7 @@ public:
PoolingLayer(const PoolingLayerInfo pool_info);
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
private:
const PoolingLayerInfo _pool_info; /**< Pooling layer information */
diff --git a/arm_compute/graph/nodes/QuantizationLayer.h b/arm_compute/graph/nodes/QuantizationLayer.h
new file mode 100644
index 000000000..a3ef02530
--- /dev/null
+++ b/arm_compute/graph/nodes/QuantizationLayer.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_QUANTIZATION_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_QUANTIZATION_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/Types.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Quantization layer node */
+class QuantizationLayer final : public INode
+{
+public:
+ // Inherited methods overridden:
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_QUANTIZATION_LAYER_H__ */
diff --git a/arm_compute/graph/nodes/ReshapeLayer.h b/arm_compute/graph/nodes/ReshapeLayer.h
new file mode 100644
index 000000000..b727d33a2
--- /dev/null
+++ b/arm_compute/graph/nodes/ReshapeLayer.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_RESHAPE_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_RESHAPE_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/Types.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Reshape layer node */
+class ReshapeLayer final : public INode
+{
+public:
+ /** Constructor
+ *
+ * @param[in] shape Output shape
+ */
+ ReshapeLayer(const TensorShape shape);
+
+ // Inherited methods overridden:
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+
+private:
+ TensorShape _shape;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_RESHAPE_LAYER_H__ */
diff --git a/arm_compute/graph/nodes/SoftmaxLayer.h b/arm_compute/graph/nodes/SoftmaxLayer.h
index 2e1bd98c8..b5d1bc53f 100644
--- a/arm_compute/graph/nodes/SoftmaxLayer.h
+++ b/arm_compute/graph/nodes/SoftmaxLayer.h
@@ -26,20 +26,19 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/Types.h"
namespace arm_compute
{
namespace graph
{
/** Softmax layer node */
-class SoftmaxLayer : public INode
+class SoftmaxLayer final : public INode
{
public:
// Inherited methods overriden:
- std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override;
+ std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
};
-
} // namespace graph
} // namespace arm_compute
#endif /* __ARM_COMPUTE_GRAPH_SOFTMAX_LAYER_H__ */
diff --git a/arm_compute/runtime/BlobLifetimeManager.h b/arm_compute/runtime/BlobLifetimeManager.h
index ec43f47fe..edf4d4342 100644
--- a/arm_compute/runtime/BlobLifetimeManager.h
+++ b/arm_compute/runtime/BlobLifetimeManager.h
@@ -24,21 +24,20 @@
#ifndef __ARM_COMPUTE_BLOBLIFETIMEMANAGER_H__
#define __ARM_COMPUTE_BLOBLIFETIMEMANAGER_H__
-#include "arm_compute/runtime/ILifetimeManager.h"
+#include "arm_compute/runtime/ISimpleLifetimeManager.h"
-#include "arm_compute/runtime/IMemoryGroup.h"
+#include "arm_compute/runtime/IMemoryPool.h"
#include "arm_compute/runtime/Types.h"
#include <cstddef>
-#include <map>
+#include <memory>
#include <vector>
namespace arm_compute
{
-class IMemoryGroup;
-
-/** Class that tracks the lifetime of registered tensors and calculates the systems memory requirements in terms of blobs */
-class BlobLifetimeManager : public ILifetimeManager
+/** Concrete class that tracks the lifetime of registered tensors and
+ * calculates the system's memory requirements in terms of blobs */
+class BlobLifetimeManager : public ISimpleLifetimeManager
{
public:
/** Constructor */
@@ -53,35 +52,15 @@ public:
BlobLifetimeManager &operator=(BlobLifetimeManager &&) = default;
// Inherited methods overridden:
- void register_group(IMemoryGroup *group) override;
- void start_lifetime(void *obj) override;
- void end_lifetime(void *obj, void **handle, size_t size) override;
std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
- bool are_all_finalized() const override;
MappingType mapping_type() const override;
private:
- /** Update blobs and mappings */
- void update_blobs_and_mappings();
+ // Inherited methods overridden:
+ void update_blobs_and_mappings() override;
private:
- /** Element struct */
- struct Element
- {
- Element(void *id_ = nullptr, void **handle_ = nullptr, size_t size_ = 0, bool status_ = false)
- : id(id_), handle(handle_), size(size_), status(status_)
- {
- }
- void *id; /**< Element id */
- void **handle; /**< Element's memory handle */
- size_t size; /**< Element's size */
- bool status; /**< Lifetime status */
- };
-
- IMemoryGroup *_active_group; /**< Active group */
- std::vector<Element> _active_elements; /**< A map that contains the active elements */
- std::map<IMemoryGroup *, std::vector<Element>> _finalized_groups; /**< A map that contains the finalized groups */
- std::vector<size_t> _blobs;
+ std::vector<size_t> _blobs; /**< Memory blobs' sizes */
};
-} // arm_compute
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_BLOBLIFETIMEMANAGER_H__ */
diff --git a/arm_compute/runtime/BlobMemoryPool.h b/arm_compute/runtime/BlobMemoryPool.h
index f703bf0b8..25bfd539f 100644
--- a/arm_compute/runtime/BlobMemoryPool.h
+++ b/arm_compute/runtime/BlobMemoryPool.h
@@ -79,5 +79,5 @@ private:
std::vector<void *> _blobs; /**< Vector holding all the memory blobs */
std::vector<size_t> _blob_sizes; /**< Sizes of each blob */
};
-} // arm_compute
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_BLOBMEMORYPOOL_H__ */
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 360372d19..f6ecef7a5 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -42,9 +42,9 @@
#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
#include "arm_compute/runtime/CL/functions/CLConvolution.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
-#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLDerivative.h"
@@ -59,7 +59,8 @@
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
@@ -71,7 +72,7 @@
#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
#include "arm_compute/runtime/CL/functions/CLHistogram.h"
#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
-#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
diff --git a/arm_compute/runtime/CL/CLMultiImage.h b/arm_compute/runtime/CL/CLMultiImage.h
index f70929db0..2c2b4709b 100644
--- a/arm_compute/runtime/CL/CLMultiImage.h
+++ b/arm_compute/runtime/CL/CLMultiImage.h
@@ -44,18 +44,18 @@ public:
CLMultiImage();
/** Init the multi-planar image
*
- * @param[in] width Width of the whole image
- * @param[in] height Heigth of the whole image
- * @param[in] format Format of the whole image
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
*/
void init(unsigned int width, unsigned int height, Format format);
/** Init the multi-planar image
*
* @note Uses conservative padding strategy which fits all kernels.
*
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
*/
void init_auto_padding(unsigned int width, unsigned int height, Format format);
/** Allocate a previously initialised multi image
@@ -73,10 +73,10 @@ public:
private:
/** Init the multi-planar image
*
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
- * @param[in] auto_padding Specifies whether the image uses auto padding
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ * @param[in] auto_padding Specifies whether the image uses auto padding
*/
void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding);
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h
index a1aeb193d..5b99abc5f 100644
--- a/arm_compute/runtime/CL/functions/CLActivationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h
@@ -49,6 +49,16 @@ public:
* @param[in] act_info Activation layer parameters.
*/
void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
+ * of the activation function. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
};
}
#endif /* __ARM_COMPUTE_CLACTIVATIONLAYER_H__ */
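The new static validate() checks a configuration from tensor metadata alone, before any OpenCL resources are committed. A minimal sketch of the validate-then-configure flow, assuming the Status/ErrorCode types from core/Error.h; the shape is illustrative:

    CLScheduler::get().default_init(); // one-time OpenCL context/queue setup
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    if(CLActivationLayer::validate(&info, &info, act_info).error_code() == ErrorCode::OK)
    {
        CLTensor input, output;
        input.allocator()->init(info);
        output.allocator()->init(info);

        CLActivationLayer act;
        act.configure(&input, &output, act_info);
        input.allocator()->allocate();
        output.allocator()->allocate();
        act.run();
    }

The same pattern applies to the other functions gaining a validate() overload in this release (arithmetic addition/subtraction, batch normalization, direct convolution).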
diff --git a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
index f888256b3..1ef3e274c 100644
--- a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
+++ b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
@@ -47,6 +47,16 @@ public:
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAddition
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
};
}
#endif /* __ARM_COMPUTE_CLARITHMETICADDITION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
index eedeaa8d0..0d3f5bce6 100644
--- a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
+++ b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
@@ -48,6 +48,16 @@ public:
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtraction
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
};
}
#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
index ffb66bee6..127de1055 100644
--- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -51,14 +51,32 @@ public:
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
* The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] epsilon Small value to avoid division with zero.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
*/
void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayer
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division with zero.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *var,
+ const ITensorInfo *beta, const ITensorInfo *gamma,
+ float epsilon);
// Inherited methods overridden:
void run() override;
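Note the reordering: output now directly follows input, ahead of the statistics tensors. A minimal sketch against allocated CLTensors (the epsilon value is an illustrative choice, not a library default):

    CLBatchNormalizationLayer bn;
    bn.configure(&input, &output, &mean, &var, &beta, &gamma, 0.001f); // new argument order
    bn.run();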
diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h
index 175337462..9182feffc 100644
--- a/arm_compute/runtime/CL/functions/CLChannelExtract.h
+++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h
@@ -39,14 +39,14 @@ class CLChannelExtract : public ICLSimpleFunction
public:
/** Initialize the function's source, destination
*
- * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar.
+ * @param[in] input The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
* @param[in] channel The channel to extract.
* @param[out] output The extracted channel. Must be of U8 format.
*/
void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
/** Initialize the function's source, destination
*
- * @param[in] input The multi-planar input image to extract channel from.
+ * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444
* @param[in] channel The channel to extract.
* @param[out] output The extracted 2D channel. Must be of U8 format.
*/
diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h
index 12457a0cf..dd7de4547 100644
--- a/arm_compute/runtime/CL/functions/CLColorConvert.h
+++ b/arm_compute/runtime/CL/functions/CLColorConvert.h
@@ -41,26 +41,27 @@ class CLColorConvert : public ICLSimpleFunction
public:
/** Initialize the function's source, destination
*
- * @param[in] input The input single-planar tensor from which to convert
- * @param[in] output The converted single-planar output tensor
+ * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+ * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+ * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888)
*/
void configure(const ICLTensor *input, ICLTensor *output);
/** Initialize the function's source, destination
*
- * @param[in] input The multi-planar input image from which to convert
- * @param[in] output The converted single-planar output image
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
*/
void configure(const ICLMultiImage *input, ICLImage *output);
/** Initialize the function's source, destination
*
- * @param[in] input The single-planar input image from which to convert
- * @param[in] output The converted multi-planar output image
+ * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+ * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
*/
void configure(const ICLImage *input, ICLMultiImage *output);
/** Initialize the function's source, destination
*
- * @param[in] input The multi-planar input image from which to convert
- * @param[in] output The converted multi-planar output image
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
*/
void configure(const ICLMultiImage *input, ICLMultiImage *output);
};
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index cd1ea70a2..a8a04a0bb 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -36,6 +36,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include <memory>
@@ -55,7 +57,8 @@ public:
CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QS8/QASYMM8/QS16/F16/F32.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
* @param[out] output Destination tensor. Data types supported: Same as @p weights.
* @param[in] transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
@@ -79,7 +82,8 @@ private:
* -# @ref CLGEMMTranspose1xWKernel (executed only once for each configuration)
* -# @ref CLIm2ColKernel
* -# @ref CLGEMMInterleave4x4Kernel
- * -# @ref CLGEMMMatrixMultiplyKernel
+ * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale (if quantized asymmetric)
* -# @ref CLCol2ImKernel
*/
class CLConvolutionLayer : public IFunction
@@ -91,14 +95,15 @@ public:
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QS8/QS16/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
+ * Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input.
*/
void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
@@ -106,20 +111,37 @@ public:
void run() override;
private:
- CLMemoryGroup _memory_group;
- CLConvolutionLayerReshapeWeights _reshape_weights;
- CLIm2ColKernel _input_im2col_kernel;
- CLGEMMInterleave4x4Kernel _input_interleave_kernel;
- CLGEMMMatrixMultiplyKernel _mm_kernel;
- CLCol2ImKernel _output_col2im_kernel;
- CLTensor _input_im2col_reshaped;
- CLTensor _input_interleaved_reshaped;
- CLTensor _weights_reshaped;
- CLTensor _weights_transposed;
- CLTensor _gemm_output;
- bool _has_bias;
- bool _is_fully_connected_convolution;
- bool _are_weights_reshaped;
+ /** Configures the appropriate matrix multiply routine
+ *
+ * @param input Input tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param weights Weights tensor. Data type supported: Same as @p input.
+ * @param output Output tensor. Data types supported: Same as @p input,
+ * except for input of QASYMM8 type where output should be of S32 type.
+ * @param is_interleaved_transposed Flag that signals if matrix is interleaved transposed
+ */
+ void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed = true);
+
+private:
+ CLMemoryGroup _memory_group;
+ CLConvolutionLayerReshapeWeights _reshape_weights;
+ CLIm2ColKernel _input_im2col_kernel;
+ CLGEMMInterleave4x4Kernel _input_interleave_kernel;
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
+ CLCol2ImKernel _output_col2im_kernel;
+
+ CLTensor _input_im2col_reshaped;
+ CLTensor _input_interleaved_reshaped;
+ CLTensor _weights_reshaped;
+ CLTensor _weights_transposed;
+ CLTensor _gemm_output;
+ CLTensor _tmp_output;
+
+ bool _append_bias;
+ bool _is_fully_connected_convolution;
+ bool _are_weights_reshaped;
+ bool _is_quantized;
};
}
#endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */
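For the new quantized path, the doc comments above imply QASYMM8 input/weights with S32 biases. A hedged sketch, assuming the TensorInfo overload that takes a QuantizationInfo; quantization parameters and shapes are illustrative, and tensor allocation/filling is omitted:

    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(28U, 28U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128)));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.2f, 100)));
    biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::S32)); // S32 biases for QASYMM8 input
    dst.allocator()->init(TensorInfo(TensorShape(28U, 28U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128)));

    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1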
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h
index 77997f6bd..00b3b66c9 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h
@@ -29,7 +29,7 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include <memory>
@@ -42,14 +42,14 @@ class ICLTensor;
/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
*
* -# @ref CLFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
- * -# @ref CLDepthConcatenateKernel
+ * -# @ref CLDepthConcatenateLayerKernel
*
*/
-class CLDepthConcatenate : public IFunction
+class CLDepthConcatenateLayer : public IFunction
{
public:
/** Default constructor */
- CLDepthConcatenate();
+ CLDepthConcatenateLayer();
/** Initialise the kernel's inputs vector and output.
*
* @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32.
@@ -61,10 +61,10 @@ public:
void run() override;
private:
- std::vector<ICLTensor *> _inputs_vector;
- std::unique_ptr<CLDepthConcatenateKernel[]> _concat_kernels_vector;
- std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector;
- unsigned int _num_inputs;
+ std::vector<ICLTensor *> _inputs_vector;
+ std::unique_ptr<CLDepthConcatenateLayerKernel[]> _concat_kernels_vector;
+ std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector;
+ unsigned int _num_inputs;
};
}
#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATE_H__ */
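A minimal usage sketch of the renamed function, assuming t0, t1 and out are allocated CLTensors with shapes compatible for concatenation along z:

    std::vector<ICLTensor *> inputs = { &t0, &t1 };
    CLDepthConcatenateLayer concat;
    concat.configure(inputs, &out);
    concat.run();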
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvert.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
index 9a4c63dd6..c84dc1550 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConvert.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
@@ -33,8 +33,8 @@ namespace arm_compute
{
class ICLTensor;
-/** Basic function to run @ref CLDepthConvertKernel. */
-class CLDepthConvert : public ICLSimpleFunction
+/** Basic function to run @ref CLDepthConvertLayerKernel. */
+class CLDepthConvertLayer : public ICLSimpleFunction
{
public:
/** Initialize the function's source, destination
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index 53bc079cb..f7899415d 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -24,7 +24,7 @@
#ifndef __ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__
#define __ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
@@ -40,30 +40,32 @@ class ICLTensor;
/** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following OpenCL kernels:
*
- * -# @ref CLDepthwiseConvolution3x3Kernel
+ * -# @ref CLDepthwiseConvolutionLayer3x3Kernel
* -# @ref CLFillBorderKernel (if pad_x or pad_y > 0)
*
*/
-class CLDepthwiseConvolution3x3 : public IFunction
+class CLDepthwiseConvolutionLayer3x3 : public IFunction
{
public:
/** Default constructor */
- CLDepthwiseConvolution3x3();
+ CLDepthwiseConvolutionLayer3x3();
/** Initialize the function's source, destination, conv and border_size.
*
- * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling).
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info);
+ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
// Inherited methods overriden:
void run() override;
private:
- CLDepthwiseConvolution3x3Kernel _kernel;
- CLFillBorderKernel _border_handler;
+ CLDepthwiseConvolutionLayer3x3Kernel _kernel;
+ CLFillBorderKernel _border_handler;
};
/** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels:
@@ -74,19 +76,21 @@ private:
* -# @ref CLFillBorderKernel (if pad_x or pad_y > 0)
*
*/
-class CLDepthwiseConvolution : public IFunction
+class CLDepthwiseConvolutionLayer : public IFunction
{
public:
/** Default constructor */
- CLDepthwiseConvolution();
+ CLDepthwiseConvolutionLayer();
/** Initialize the function's source, destination, weights and convolution information.
*
* @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling).
- * @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info);
+ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
// Inherited methods overriden:
void run() override;
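Besides the rename, the signature changed: weights and the new optional biases now precede output, and biases may be nullptr when not needed. A minimal sketch against allocated CLTensors (stride/pad values illustrative):

    CLDepthwiseConvolutionLayer3x3 dwc;
    dwc.configure(&input, &weights, nullptr /* biases not needed */, &output,
                  PadStrideInfo(1, 1, 1, 1));
    dwc.run();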
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
index 7dabed181..27cee5ed3 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
@@ -27,7 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/IFunction.h"
@@ -39,7 +39,7 @@ class ICLTensor;
/** Basic function to execute depthwise convolution. This function calls the following OpenCL kernels and function:
*
- * -# @ref CLDepthwiseConvolution
+ * -# @ref CLDepthwiseConvolutionLayer
* -# @ref CLDirectConvolutionLayer
*
*/
@@ -53,24 +53,27 @@ public:
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32.
* @param[in] depthwise_weights Depthwise convolution weights tensor. These are 3D tensors with dimensions [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] depthwise_biases (Optional) Biases tensor. Biases are 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p weights.
* @param[out] depthwise_out Depthwise destination tensor.
* @param[in] pointwise_weights Pointwise convolution weights tensor. These are 4D tensors with dimensions [1, 1, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+ * @param[in] pointwise_biases (Optional) Biases tensor. Biases are 1D tensor with dimensions [OFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p weights.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] depthwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for depthwise convolution.
* @param[in] pointwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for pointwise convolution.
*/
- void configure(ICLTensor *input, const ICLTensor *depthwise_weights, ICLTensor *depthwise_out, const ICLTensor *pointwise_weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &depthwise_conv_info,
- const PadStrideInfo &pointwise_conv_info);
+ void configure(ICLTensor *input, const ICLTensor *depthwise_weights, const ICLTensor *depthwise_biases, ICLTensor *depthwise_out,
+ const ICLTensor *pointwise_weights, const ICLTensor *pointwise_biases, ICLTensor *output,
+ const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info);
// Inherited methods overriden:
void run() override;
private:
- CLDepthwiseConvolution _depthwise_conv;
- CLDirectConvolutionLayer _pointwise_conv;
+ CLDepthwiseConvolutionLayer _depthwise_conv;
+ CLDirectConvolutionLayer _pointwise_conv;
};
}
#endif /*__ARM_COMPUTE_CL_DEPTHWISE_SEPARABLE_CONVOLUTION_H__ */
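The single shared bias argument is replaced by separate optional depthwise and pointwise biases, either of which may be nullptr. A hedged sketch matching the new parameter order, against allocated CLTensors:

    CLDepthwiseSeparableConvolutionLayer dsc;
    dsc.configure(&input,
                  &dw_weights, &dw_biases, &dw_out,                      // depthwise stage
                  &pw_weights, nullptr /* no pointwise bias */, &output, // pointwise stage
                  PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
    dsc.run();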
diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h
index 8534139c8..e4173ac51 100644
--- a/arm_compute/runtime/CL/functions/CLDilate.h
+++ b/arm_compute/runtime/CL/functions/CLDilate.h
@@ -49,7 +49,7 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
};
}
#endif /*__ARM_COMPUTE_CLDILATE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
index 4c85277c0..f31a45be9 100644
--- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
@@ -45,14 +45,29 @@ public:
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QS8/QS16/F16/F32.
+ * Data types supported: QASYMM8/QS8/QS16/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayer
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h
index cd2f5516e..a929cc9ba 100644
--- a/arm_compute/runtime/CL/functions/CLErode.h
+++ b/arm_compute/runtime/CL/functions/CLErode.h
@@ -49,7 +49,7 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
};
}
#endif /*__ARM_COMPUTE_CLERODE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index f71e2a33f..2cac06c1c 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -32,6 +32,8 @@
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
namespace arm_compute
{
@@ -46,7 +48,7 @@ class CLFullyConnectedLayerReshapeWeights : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
- * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QASYMM8/QS16/F16/F32.
* @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
*/
void configure(const ICLTensor *input, ICLTensor *output);
@@ -56,8 +58,8 @@ public:
*
* -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
* -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
- * -# @ref CLGEMMMatrixMultiplyKernel
- * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
+ * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale (if quantized asymmetric) (if @p biases is not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -68,7 +70,7 @@ public:
CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data type supported: QS8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data type supported: QS8/QASYMM8/QS16/F16/F32.
* @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
* @param[in] biases Bias tensor. It can be nullptr. Data type supported:Same as @p input.
* @param[out] output Destination tensor. Data type supported: Same as @p input.
@@ -83,17 +85,22 @@ public:
private:
void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+ void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed = true);
- CLMemoryGroup _memory_group;
- CLIm2ColKernel _im2col_kernel;
- CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
- CLGEMMMatrixMultiplyKernel _mm_kernel;
- CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
- CLTensor _im2col_output;
- CLTensor _reshape_weights_output;
- bool _are_weights_reshaped;
- bool _is_fc_after_conv;
- bool _accumulate_biases;
+ CLMemoryGroup _memory_group;
+ CLIm2ColKernel _im2col_kernel;
+ CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
+ CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ CLTensor _im2col_output;
+ CLTensor _gemmlowp_output;
+ CLTensor _reshape_weights_output;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _accumulate_biases;
+ bool _is_quantized;
};
}
#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h
index 8c755aeab..ae05b0fd9 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h
@@ -40,11 +40,11 @@ class CLGEMMInterleave4x4 : public ICLSimpleFunction
public:
/** Initialise the kernel's inputs, output
*
- * @param[in] input First input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[in] input First input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
};
}
-#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ */
\ No newline at end of file
+#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
deleted file mode 100644
index 613fcaa7e..000000000
--- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLGEMMLOWP_H__
-#define __ARM_COMPUTE_CLGEMMLOWP_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
-#include "arm_compute/runtime/CL/CLMemoryGroup.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute GEMMLowp on OpenCL. This function calls the following OpenCL kernels:
-*
-* -# @ref CLGEMMInterleave4x4Kernel
-* -# @ref CLGEMMTranspose1xWKernel
-* -# @ref CLGEMMLowpMatrixMultiplyKernel
-*
-*/
-class CLGEMMLowp : public IFunction
-{
-public:
- /** Constructor */
- CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the kernel's inputs, output
- *
- * @note GEMM_LOWP: low precision matrix multiply kernel
- * This kernel performs the following computation:
- *
- * -# Convert a values from uint8 to int32 and add a_offset to each of them.
- * -# Convert b values from uint8 to int32 and add b_offset to each of them.
- * -# Compute the int32 matrix product of the resulting a * b.
- * -# Add output_offset to each entry of the result.
- * -# Multiply each entry of the result and round to the nearest integer
- * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
- *
- * @param[in] a First input tensor (Matrix A). Data types supported: U8.
- * @param[in] b Second input tensor (Matrix B). Data types supported: same as @p a.
- * @param[out] output Output tensor. Data types supported: same as @p a.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_offset Offset to be added to each element of the output matrix
- * @param[in] output_mult_int Multiplied with each element of the output matrix
- * @param[in] shift Number of bits to shift right the result.
- */
- void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLMemoryGroup _memory_group;
- CLGEMMInterleave4x4Kernel _interleave_kernel;
- CLGEMMTranspose1xWKernel _transpose_kernel;
- CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
- CLTensor _tmp_a;
- CLTensor _tmp_b;
-};
-}
-#endif /*__ARM_COMPUTE_CLGEMMLOWP_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
new file mode 100644
index 000000000..e31614454
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__
+#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__
+
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IMemoryManager;
+class ICLTensor;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref CLGEMMTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref CLGEMMLowpMatrixMultiplyKernel
+ * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
+ * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
+ * -# @ref CLGEMMLowpOffsetContributionKernel
+ *
+ */
+class CLGEMMLowpMatrixMultiplyCore : public IFunction
+{
+public:
+ /** Constructor */
+ CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Initialise the kernel's inputs, output
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ * This function performs the following computations:
+ *
+ * -# Convert a's values from QASYMM8 to int32 and add a_offset to each of them.
+ * -# Convert b's values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[out] output Output tensor. Data type supported: S32
+ * @param[in] gemm_info (Optional) Specifies whether matrix A and/or matrix B have been reshaped and
+ * whether the reshape of matrix B should be executed only on the first run
+ */
+ void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
+ CLGEMMInterleave4x4Kernel _mtx_a_reshape_kernel;
+ CLGEMMTranspose1xWKernel _mtx_b_reshape_kernel;
+ CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+ CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+ CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
+ CLTensor _vector_sum_col;
+ CLTensor _vector_sum_row;
+ CLTensor _tmp_a;
+ CLTensor _tmp_b;
+ int32_t _a_offset;
+ int32_t _b_offset;
+ bool _is_interleaved_transposed;
+ bool _is_first_run;
+ bool _reshape_b_only_on_first_run;
+};
+}
+#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */
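
A minimal usage sketch of the new function (illustrative only: the shapes, the data filling and the surrounding setup are assumptions, not part of this patch):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    void gemmlowp_core_example()
    {
        CLScheduler::get().default_init();

        const unsigned int M = 4, N = 8, K = 16;
        CLTensor a, b, dst;
        // TensorShape is (width, height): a is M x K, b is K x N, dst is M x N
        a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8));
        b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8));
        dst.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32));

        CLGEMMLowpMatrixMultiplyCore mm;
        mm.configure(&a, &b, &dst); // default GEMMInfo: no reshape hints

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill a and b with quantized data, then:
        mm.run();
        CLScheduler::get().sync();
    }
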
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
new file mode 100644
index 000000000..7446ff4b3
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__
+#define __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+/** This file contains all available output stages for GEMMLowp on OpenCL.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final ASYMM8 value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8Scale on OpenCL.
+ *
+ * CLGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
+ * The final result is:
+ *
+ * ((input[i][k] + result_offset) * result_mult_int) >> result_shift
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
+ *
+ * This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ * after the result is shifted right by result_shift
+ */
+class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_offset Offset to be added to each element of the input matrix
+ * @param[in] result_mult_int Value to be multiplied with each element of the input matrix once the result_offset has been added
+ * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale
+ *
+ * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+};
+
+/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
+ *
+ * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
+ *
+ * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
+ *
+ * The final result is:
+ *
+ * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ * (x * y) / 2^31
+ *
+ * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
+ *
+ * This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ * after the result is shifted right by result_shift
+ */
+class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed-point value to be multiplied with each element of the input matrix once the result_offset has been added
+ * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to the result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+ *
+ * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
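
As a concrete (hypothetical) instance of the first formula: with result_offset = 2, result_mult_int = 3 and result_shift = 4, an int32 accumulator of 100 maps to ((100 + 2) * 3) >> 4 = 306 >> 4 = 19. A sketch wiring this stage after the core multiplication (the mm_result tensor and the parameter values are assumptions):

    // mm_result is the S32 output of CLGEMMLowpMatrixMultiplyCore (see the sketch above);
    // dst receives the re-quantized QASYMM8 values.
    CLTensor dst;
    dst.allocator()->init(TensorInfo(mm_result.info()->tensor_shape(), 1, DataType::QASYMM8));

    CLGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
    Status status = CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(mm_result.info(), nullptr, dst.info());
    if(bool(status))
    {
        output_stage.configure(&mm_result, nullptr /* no bias */, &dst,
                               2 /* result_offset */, 3 /* result_mult_int */, 4 /* result_shift */);
        dst.allocator()->allocate();
        output_stage.run();
    }
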
diff --git a/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h b/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h
index 866c17b51..ae56548c2 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h
@@ -38,7 +38,7 @@ class CLGEMMTranspose1xW : public ICLSimpleFunction
public:
/** Initialise the kernel's inputs, output
*
- * @param[in] input First input tensor. Data type supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32/
+ * @param[in] input First input tensor. Data type supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
index 0b4fad776..158783693 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDetector.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h
@@ -55,7 +55,7 @@ public:
*
 * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is the caller's responsibility to clear it.
*
- * @param[in] input Input tensor. It is the output of @ref NEHOGDescriptor. Data type supported: F32
+ * @param[in] input Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
* @param[in] hog HOG data-object that describes the HOG descriptor
* @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects
* @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h
index 25fc549b2..71f6897d1 100644
--- a/arm_compute/runtime/CL/functions/CLIntegralImage.h
+++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h
@@ -43,10 +43,10 @@ public:
/** Default Constructor. */
CLIntegralImage();
/** Initialise the function's source, destinations and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor, Data types supported: U32.
- */
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U32.
+ */
void configure(const ICLTensor *input, ICLTensor *output);
// Inherited methods overridden:
diff --git a/arm_compute/runtime/CL/functions/CLL2Normalize.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index 20af54eda..8aea7a641 100644
--- a/arm_compute/runtime/CL/functions/CLL2Normalize.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -24,7 +24,7 @@
#ifndef __ARM_COMPUTE_CLL2NORMALIZE_H__
#define __ARM_COMPUTE_CLL2NORMALIZE_H__
-#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
@@ -41,11 +41,11 @@ class ICLTensor;
/** Perform a L2 normalization on a given axis.
*/
-class CLL2Normalize : public IFunction
+class CLL2NormalizeLayer : public IFunction
{
public:
/** Constructor */
- CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
@@ -60,10 +60,10 @@ public:
void run() override;
private:
- CLMemoryGroup _memory_group;
- CLReductionOperation _reduce_func;
- CLL2NormalizeKernel _normalize_kernel;
- CLTensor _sumsq;
+ CLMemoryGroup _memory_group;
+ CLReductionOperation _reduce_func;
+ CLL2NormalizeLayerKernel _normalize_kernel;
+ CLTensor _sumsq;
};
}
#endif /*__ARM_COMPUTE_CLL2NORMALIZE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
index 0c6708aa7..585a013e3 100644
--- a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
+++ b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
@@ -27,7 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
#include "arm_compute/runtime/IFunction.h"
@@ -77,7 +77,7 @@ private:
CLGaussianPyramidHalf _gaussian_pyr_function;
std::unique_ptr<CLGaussian5x5[]> _convf;
std::unique_ptr<CLArithmeticSubtraction[]> _subf;
- CLDepthConvert _depth_function;
+ CLDepthConvertLayer _depth_function;
CLPyramid _gauss_pyr;
CLPyramid _conv_pyr;
};
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
index 4bc7eb65c..4a676c85a 100644
--- a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
+++ b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
@@ -27,7 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLPyramid.h"
#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
#include "arm_compute/runtime/CL/functions/CLScale.h"
#include "arm_compute/runtime/IFunction.h"
@@ -43,7 +43,7 @@ using ICLImage = ICLTensor;
*
* -# @ref CLArithmeticAddition
* -# @ref CLScale
- * -# @ref CLDepthConvert
+ * -# @ref CLDepthConvertLayer
*
* This function reconstructs the original image from a Laplacian Image Pyramid.
*
@@ -85,7 +85,7 @@ private:
CLPyramid _tmp_pyr;
std::unique_ptr<CLArithmeticAddition[]> _addf;
std::unique_ptr<CLScale[]> _scalef;
- CLDepthConvert _depthf;
+ CLDepthConvertLayer _depthf;
};
}
#endif /*__ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h
index dc5f9139b..f9c7e5c14 100644
--- a/arm_compute/runtime/CL/functions/CLMagnitude.h
+++ b/arm_compute/runtime/CL/functions/CLMagnitude.h
@@ -41,8 +41,9 @@ public:
* @param[in] input2 Second tensor input. Data types supported: S16.
* @param[out] output Output tensor. Data types supported: S16.
* @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
+ * @param[in] use_fp16 (Optional) If true, FP16 kernels will be used; if false, F32 kernels are used.
*/
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM, bool use_fp16 = false);
};
}
#endif /*__ARM_COMPUTE_CLMAGNITUDE_H__ */
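
A sketch of the extended signature (the gradient tensors and the device's FP16 support are assumptions):

    CLMagnitude magnitude;
    // gx and gy are S16 gradient tensors; request the FP16 path where the device supports it.
    magnitude.configure(&gx, &gy, &mag, MagnitudeType::L2NORM, true /* use_fp16 */);
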
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index 0818cec2e..93925778d 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -37,7 +37,7 @@ namespace arm_compute
{
class ICLTensor;
-/** Basic function to simulate a normalization layer. This function calls the following CL kernels:
+/** Basic function to compute a normalization layer. This function calls the following CL kernels:
*
* -# @ref CLFillBorderKernel
* -# @ref CLNormalizationLayerKernel
@@ -51,11 +51,21 @@ public:
/** Set the input and output tensors.
*
* @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler)
+ * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32 (Written to by the border handler)
* @param[out] output Destination tensor. Dimensions, data type and number of channels must match the input ones.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32
+ * @param[in] output Destination tensor. Dimensions, data type and number of channels must match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
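
The new validate() entry point lets the configuration be checked before anything is allocated; a sketch (the tensors and normalization parameters are assumptions):

    const NormalizationLayerInfo norm_info(NormType::CROSS_MAP, 5 /* norm_size */);
    Status status = CLNormalizationLayer::validate(src.info(), dst.info(), norm_info);
    if(bool(status))
    {
        CLNormalizationLayer norm;
        norm.configure(&src, &dst, norm_info);
        norm.run();
    }
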
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index 71754fc3f..d57bfda2c 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -37,15 +37,30 @@ class CLPixelWiseMultiplication : public ICLSimpleFunction
public:
 /** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in] input1 First tensor input. Data types supported: U8, S16, F16 or F32.
- * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16 or F32.
- * @param[out] output Output tensor. Data types supported: U8(Only if both inputs are U8), S16, F16 or F32.
- * @param[in] scale Scale to apply after multiplication. Must be positive.
+ * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor. Data types supported: same as @p input1.
+ * @param[out] output The output tensor. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
*/
void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication
+ *
+ * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32.
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16).
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
};
}
#endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H__ */
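
The scale restriction means only 1/255 or exact power-of-two reciprocals are accepted; a sketch, assuming two F32 input tensors:

    CLPixelWiseMultiplication mul;
    // scale must be 1/255 or 1/2^n with 0 <= n <= 15: 1.f / 256.f (n = 8) is valid, 0.01f would be rejected.
    mul.configure(&in1, &in2, &out, 1.f / 256.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
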
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 80233d400..a8bdabad9 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
@@ -42,11 +43,20 @@ class CLPoolingLayer : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
- * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32.
+ * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/QASYMM8/QS16/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer
+ *
+ * @param[in] input Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLPOOLINGLAYER_H__ */
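
A sketch of the validate-then-configure flow (the tensors and pooling parameters are assumptions):

    const PoolingLayerInfo pool_info(PoolingType::MAX, 2 /* pool size */, PadStrideInfo(2, 2, 0, 0));
    if(bool(CLPoolingLayer::validate(src.info(), dst.info(), pool_info)))
    {
        CLPoolingLayer pool;
        pool.configure(&src, &dst, pool_info);
    }
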
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index c82e646e9..411e75129 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -36,7 +36,7 @@ class CLReshapeLayer : public ICLSimpleFunction
public:
/** Initialise the kernel's inputs and outputs
*
- * @param[in] input First tensor input. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32
+ * @param[in] input First tensor input. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index db491c1a4..68d64a9e2 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -45,8 +45,10 @@ public:
* @param[in] policy The interpolation type.
* @param[in] border_mode Strategy to use for borders.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
*/
- void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue());
+ void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
+ SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
};
}
#endif /*__ARM_COMPUTE_CLSCALE_H__ */
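
A sketch of the new optional argument (the tensors, policies and the TOP_LEFT sampling mode are assumptions):

    CLScale scale;
    // Bilinear upscale; sample at pixel top-left corners instead of the default centers.
    scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE,
                    PixelValue(), SamplingPolicy::TOP_LEFT);
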
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 70a265c1a..5430f9c10 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -39,7 +39,7 @@ class ICLTensor;
/** Basic function to compute a SoftmaxLayer.
*
* Softmax is calculated by :
- * @f[ out = exp(x - max(x)) / sum(exp(x - max(x))) @f]
+ * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
*
* This function runs the following kernels:
* -# @ref CLLogits1DMaxKernel
@@ -53,22 +53,33 @@ public:
CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
* @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32
+ * @param[in] output Destination tensor. Data types supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
private:
- CLMemoryGroup _memory_group;
- CLLogits1DMaxKernel _max_kernel;
- CLLogits1DShiftExpSumKernel _shift_exp_sum_kernel;
- CLLogits1DNormKernel _norm_kernel;
- CLTensor _max;
- CLTensor _sum;
- CLTensor _tmp;
+ CLMemoryGroup _memory_group;
+ CLLogits1DMaxKernel _max_kernel;
+ CLLogits1DShiftExpSumKernel _shift_exp_sum_kernel;
+ CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
+ CLLogits1DNormKernel _norm_kernel;
+ CLTensor _max;
+ CLTensor _sum;
+ CLTensor _tmp;
+ bool _run_legacy_path;
};
}
#endif /* __ARM_COMPUTE_CLSOFTMAXLAYER_H__ */
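
In the updated formula, beta acts as an inverse temperature on the logits; a sketch (the tensors and the beta value are assumptions):

    CLSoftmaxLayer softmax;
    // beta < 1 softens the distribution, beta > 1 sharpens it; beta = 1 is the plain softmax.
    softmax.configure(&logits, &probs, 0.5f /* beta */);
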
diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h
index 9ac5458a9..89a2022e7 100644
--- a/arm_compute/runtime/CL/functions/CLTranspose.h
+++ b/arm_compute/runtime/CL/functions/CLTranspose.h
@@ -40,10 +40,18 @@ class CLTranspose : public ICLSimpleFunction
public:
/** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLTranspose
+ *
+ * @param[in] input The input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] output The output tensor. Data types supported: Same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};
}
diff --git a/arm_compute/runtime/CPP/CPPFunctions.h b/arm_compute/runtime/CPP/CPPFunctions.h
new file mode 100644
index 000000000..1f01ffac8
--- /dev/null
+++ b/arm_compute/runtime/CPP/CPPFunctions.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPFUNCTIONS_H__
+#define __ARM_COMPUTE_CPPFUNCTIONS_H__
+
+/* Header grouping all the CPP functions */
+#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
+
+#endif /* __ARM_COMPUTE_CPPFUNCTIONS_H__ */
diff --git a/arm_compute/runtime/CPP/ICPPSimpleFunction.h b/arm_compute/runtime/CPP/ICPPSimpleFunction.h
new file mode 100644
index 000000000..d1bd23258
--- /dev/null
+++ b/arm_compute/runtime/CPP/ICPPSimpleFunction.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICPPSIMPLEFUNCTION_H__
+#define __ARM_COMPUTE_ICPPSIMPLEFUNCTION_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic interface for functions which have a single CPP kernel */
+class ICPPSimpleFunction : public IFunction
+{
+public:
+ /** Constructor */
+ ICPPSimpleFunction();
+
+ // Inherited methods overridden:
+ void run() override final;
+
+protected:
+ std::unique_ptr<ICPPKernel> _kernel; /**< Kernel to run */
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_ICPPSIMPLEFUNCTION_H__ */
diff --git a/arm_compute/runtime/CPP/functions/CPPPermute.h b/arm_compute/runtime/CPP/functions/CPPPermute.h
new file mode 100644
index 000000000..0094576da
--- /dev/null
+++ b/arm_compute/runtime/CPP/functions/CPPPermute.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPPERMUTE_H__
+#define __ARM_COMPUTE_CPPPERMUTE_H__
+
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref CPPPermuteKernel */
+class CPPPermute : public ICPPSimpleFunction
+{
+public:
+ /** Configure the permute CPP kernel
+ *
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration of @ref CPPPermute
+ *
+ * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output The output tensor. Data types supported: Same as @p input
+ * @param[in] perm Permutation vector
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
+};
+}
+#endif /* __ARM_COMPUTE_CPPPERMUTE_H__ */
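
A sketch of a [W, H, C] to [C, W, H] permutation (the tensors, and the convention that destination dimension i takes source dimension perm[i], are assumptions):

    CPPPermute permute;
    // Assuming dst dimension i is taken from src dimension perm[i]:
    // dst[0] = src[2] (C), dst[1] = src[0] (W), dst[2] = src[1] (H)
    permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U));
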
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
new file mode 100644
index 000000000..e76d4efb2
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCFUNCTIONS_H__
+#define __ARM_COMPUTE_GCFUNCTIONS_H__
+
+/* Header grouping all the GLES compute functions */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h"
+
+#endif /* __ARM_COMPUTE_GCFUNCTIONS_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h b/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
new file mode 100644
index 000000000..817f8b54b
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCSCHEDULER_H__
+#define __ARM_COMPUTE_GCSCHEDULER_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class IGCKernel;
+
+/** Provides global access to an OpenGL ES context and command queue. */
+class GCScheduler
+{
+private:
+ /** Constructor */
+ GCScheduler();
+
+public:
+ /** Access the scheduler singleton.
+ *
+ * @return The scheduler
+ */
+ static GCScheduler &get();
+
+ /** Initialises the context and command queue used by the scheduler to default values
+ * and sets a default device and kernel path for the @ref GCKernelLibrary.
+ */
+ void default_init();
+
+ /** Schedule the execution of the passed kernel if possible.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel.
+ */
+ void enqueue(IGCKernel &kernel, bool flush = true);
+
+ /** Initialises the display and context to be used by the scheduler.
+ *
+ * @param[in] dpy The EGL display connection
+ * @param[in] ctx The EGL rendering context
+ */
+ void init(EGLDisplay dpy, EGLContext ctx);
+
+ /** Blocks until all commands in the associated command queue have finished. */
+ void sync();
+};
+}
+
+#endif /* __ARM_COMPUTE_GCSCHEDULER_H__ */
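
A sketch of the expected call sequence around the singleton (the surrounding functions are assumptions):

    // One-time setup of the GLES compute context, e.g. at application start:
    GCScheduler::get().default_init();
    // ... configure and run GC functions; these enqueue their kernels through the scheduler ...
    // Before reading results back on the host:
    GCScheduler::get().sync();
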
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensor.h b/arm_compute/runtime/GLES_COMPUTE/GCTensor.h
new file mode 100644
index 000000000..3e51f9908
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/GCTensor.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCTENSOR_H__
+#define __ARM_COMPUTE_GCTENSOR_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+
+namespace arm_compute
+{
+class ITensorAllocator;
+class ITensorInfo;
+
+/** Interface for OpenGL ES tensor */
+class GCTensor : public IGCTensor
+{
+public:
+ /** Default constructor */
+ GCTensor();
+
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ GCTensor(const GCTensor &) = delete;
+
+ /** Prevent instances of this class from being copy assigned (As this class contains pointers). */
+ GCTensor &operator=(const GCTensor &) = delete;
+
+ /** Allow instances of this class to be moved */
+ GCTensor(GCTensor &&) = default;
+
+ /** Allow instances of this class to be moved */
+ GCTensor &operator=(GCTensor &&) = default;
+
+ /** Virtual destructor */
+ virtual ~GCTensor() = default;
+
+ /** Return a pointer to the tensor's allocator
+ *
+ * @return A pointer to the tensor's allocator
+ */
+ ITensorAllocator *allocator();
+
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in] blocking (Optional) If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before accessing the mapped buffer.
+ */
+ void map(bool blocking = true);
+
+ /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ */
+ void unmap();
+
+ // Inherited methods overridden:
+ TensorInfo *info() const override;
+ TensorInfo *info() override;
+ uint8_t *buffer() const override;
+ GLuint gc_buffer() const override;
+
+protected:
+ // Inherited methods overridden:
+ uint8_t *do_map(bool blocking) override;
+ void do_unmap() override;
+
+private:
+ mutable GCTensorAllocator _allocator;
+};
+
+using GCImage = GCTensor;
+}
+
+#endif /*__ARM_COMPUTE_GCTENSOR_H__ */
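
A sketch of the host-side map/unmap protocol (the shape, data type and zero-fill are assumptions):

    #include <cstring> // std::memset

    GCTensor tensor;
    tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    tensor.allocator()->allocate();

    tensor.map();                                                 // blocking map: buffer() is valid on return
    std::memset(tensor.buffer(), 0, tensor.info()->total_size()); // host-side access
    tensor.unmap();                                               // unmap before the GPU touches the buffer again
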
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h b/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h
new file mode 100644
index 000000000..ce52cbbbd
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCTENSORALLOCATOR_H__
+#define __ARM_COMPUTE_GCTENSORALLOCATOR_H__
+
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of a GLES memory tensor allocator. */
+class GCTensorAllocator : public ITensorAllocator
+{
+public:
+ /** Default constructor. */
+ GCTensorAllocator();
+
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ GCTensorAllocator(const GCTensorAllocator &) = delete;
+
+ /** Prevent instances of this class from being copy assigned (As this class contains pointers). */
+ GCTensorAllocator &operator=(const GCTensorAllocator &) = delete;
+
+ /** Allow instances of this class to be moved */
+ GCTensorAllocator(GCTensorAllocator &&) = default;
+
+ /** Allow instances of this class to be moved */
+ GCTensorAllocator &operator=(GCTensorAllocator &&) = default;
+
+ /** Default destructor */
+ ~GCTensorAllocator() = default;
+
+ /** Return a pointer to the mapped data. */
+ uint8_t *data();
+
+ /** Get the OpenGL ES buffer object name
+ *
+ * @return The buffer object name
+ */
+ GLuint get_gl_ssbo_name() const;
+
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ *
+ * @return The mapping address.
+ */
+ uint8_t *map(bool blocking);
+
+ /** Enqueue an unmap operation of the allocated buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ */
+ void unmap();
+
+ /** Allocate size specified by TensorInfo of GLES memory.
+ *
+ * @note The tensor must not already be allocated when calling this function.
+ *
+ */
+ void allocate() override;
+
+ /** Free allocated GLES memory.
+ *
+ * @note The tensor must have been allocated when calling this function.
+ *
+ */
+ void free() override;
+
+protected:
+ /** Call map() on the SSBO.
+ *
+ * @return A pointer to the beginning of the tensor's allocation.
+ */
+ uint8_t *lock() override;
+
+ /** Call unmap() on the SSBO. */
+ void unlock() override;
+
+private:
+ class GLBufferWrapper
+ {
+ public:
+ GLBufferWrapper()
+ : _ssbo_name(0)
+ {
+ ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_ssbo_name));
+ }
+ ~GLBufferWrapper()
+ {
+ ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_ssbo_name));
+ }
+ GLuint _ssbo_name;
+ };
+ std::unique_ptr<GLBufferWrapper> _gl_buffer;
+ uint8_t *_mapping;
+};
+}
+
+#endif /* __ARM_COMPUTE_GCTENSORALLOCATOR_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h b/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h
new file mode 100644
index 000000000..15bbfffe9
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCSIMPLEFUNCTION_H__
+#define __ARM_COMPUTE_IGCSIMPLEFUNCTION_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic interface for functions which have a single OpenGL ES kernel */
+class IGCSimpleFunction : public IFunction
+{
+public:
+ /** Default constructor */
+ IGCSimpleFunction();
+
+ // Inherited methods overridden:
+ void run() override final;
+
+protected:
+ std::unique_ptr<IGCKernel> _kernel; /**< Kernel to run */
+ GCFillBorderKernel _border_handler; /**< Kernel to handle borders */
+};
+}
+#endif /*__ARM_COMPUTE_IGCSIMPLEFUNCTION_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h
new file mode 100644
index 000000000..0d4a354e2
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H__
+#define __ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H__
+
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref GCAbsoluteDifferenceKernel
+ *
+ * @note The tensor data types for the inputs must be U8.
+ * @note The function also calculates the absolute difference when the two inputs have different tensor data types.
+ */
+class GCAbsoluteDifference : public IGCSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input1 First input tensor. Data types supported: U8
+ * @param[in] input2 Second input tensor. Data types supported: U8
+ * @param[out] output Output tensor. Data types supported: U8
+ */
+ void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h
new file mode 100644
index 000000000..b43456b2c
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCACTIVATIONLAYER_H__
+#define __ARM_COMPUTE_GCACTIVATIONLAYER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to run @ref GCActivationLayerKernel
+ *
+ * @note The function simulates an activation layer with the specified activation function.
+ */
+class GCActivationLayer : public IGCSimpleFunction
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr, the activation function will be performed in-place
+ *
+ * @param[in, out] input Source tensor. If @p output is nullptr, this tensor will store the result
+ * of the activation function. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] act_info Activation layer parameters.
+ */
+ void configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info);
+};
+}
+#endif /* __ARM_COMPUTE_GCACTIVATIONLAYER_H__ */
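Because configure() accepts nullptr as the output, the activation can run in place. A hedged sketch, under the same context and allocation assumptions as the previous example, with the tensor assumed to be initialised and allocated as F32:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"

using namespace arm_compute;

void relu_in_place(GCTensor &feature_map)
{
    // nullptr output: the result overwrites the input tensor.
    GCActivationLayer act;
    act.configure(&feature_map, nullptr,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run();
}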
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h
new file mode 100644
index 000000000..9d81b9a7f
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to run @ref GCBatchNormalizationLayerKernel and simulate a batch normalization layer.
+ *
+ * Batch normalization is calculated by:
+ * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
+ *
+ */
+class GCBatchNormalizationLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ GCBatchNormalizationLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division by zero.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ GCBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */
+};
+}
+#endif /* __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H__ */
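The four statistics tensors are all 1D with one entry per feature map, as the parameter list above states. A sketch under the same context assumptions, with illustrative shapes:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"

using namespace arm_compute;

void batch_norm_example()
{
    constexpr unsigned int fm = 32U; // number of feature maps
    GCTensor input, output, mean, var, beta, gamma;
    const TensorInfo map_info(TensorShape(28U, 28U, fm), 1, DataType::F32);
    const TensorInfo stat_info(TensorShape(fm), 1, DataType::F32); // one value per feature map
    input.allocator()->init(map_info);
    output.allocator()->init(map_info);
    for(GCTensor *t : { &mean, &var, &beta, &gamma })
    {
        t->allocator()->init(stat_info);
    }

    GCBatchNormalizationLayer bn;
    bn.configure(&input, &output, &mean, &var, &beta, &gamma, 1e-5f); // epsilon guards the division

    // ... allocate all tensors and fill mean/var/beta/gamma, then:
    bn.run();
}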
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h
new file mode 100644
index 000000000..1151399f9
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_GCDEPTHCONCATENATE_H__
+
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to concatenate tensors along the z axis. This function calls the following kernels:
+ *
+ * -# @ref GCFillBorderKernel (executed if the input's two lowest dimensions are smaller than the corresponding output dimensions)
+ * -# @ref GCDepthConcatenateLayerKernel
+ *
+ */
+class GCDepthConcatenateLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ GCDepthConcatenateLayer();
+ /** Initialise the kernel's inputs vector and output.
+ *
+ * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F16/F32.
+ * @param[out] output Output tensor. Data types supported: Same as the tensors in @p inputs_vector.
+ */
+ void configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<GCDepthConcatenateLayerKernel[]> _concat_kernels_vector;
+ std::unique_ptr<GCFillBorderKernel[]> _border_handlers_vector;
+ unsigned int _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_GCDEPTHCONCATENATE_H__ */
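The output depth is the sum of the input depths, while the remaining dimensions must match. A sketch with two hypothetical 16-deep inputs:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"

using namespace arm_compute;

void depth_concat_example()
{
    GCTensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(56U, 56U, 16U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(56U, 56U, 16U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(56U, 56U, 32U), 1, DataType::F32));

    GCDepthConcatenateLayer concat;
    concat.configure({ &in0, &in1 }, &out); // std::vector<IGCTensor *> of the inputs

    // ... allocate the three tensors, then:
    concat.run();
}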
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
new file mode 100644
index 000000000..5472bdb9e
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to execute direct convolution function:
+ *
+ * @note Supported kernel sizes: 1x1, 3x3 and 5x5
+ * @note This OpenGL ES implementation supports stride_x values of 1 and 2
+ */
+class GCDirectConvolutionLayer : public IGCSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info);
+};
+}
+#endif /* __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__ */
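A sketch of a 3x3, stride-1 convolution with one-pixel padding, so the spatial size is preserved (shapes illustrative, same context assumptions as the earlier examples):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"

using namespace arm_compute;

void direct_conv_example()
{
    GCTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32)); // [kernel_x, kernel_y, IFM, OFM]
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));               // one bias per OFM
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    GCDirectConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // stride_x, stride_y, pad_x, pad_y

    // ... allocate all tensors and upload weights/biases, then:
    conv.run();
}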
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h
new file mode 100644
index 000000000..c51d2c161
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCDROPOUTLAYER_H__
+#define __ARM_COMPUTE_GCDROPOUTLAYER_H__
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+/** Basic function to perform a dropout operation. This function calls the following kernel:
+ *
+ * -# @ref GCDropoutLayerKernel
+ */
+class GCDropoutLayer : public IFunction
+{
+public:
+ /** Constructor */
+ GCDropoutLayer();
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[out] mask Mask tensor. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] ratio Dropout ratio
+ * @param[in] forward Forward or backward propagation
+ *
+ */
+ void configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ GCDropoutLayerKernel _dropout_kernel;
+};
+}
+
+#endif /* __ARM_COMPUTE_GCDROPOUTLAYER_H__ */
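The @p forward flag selects between the forward and backward variants of the kernel. A hedged sketch of a forward pass, with all three tensors assumed to be initialised and allocated:

#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"

using namespace arm_compute;

void dropout_forward(GCTensor &activations, GCTensor &mask, GCTensor &output)
{
    // Forward pass dropping roughly half of the activations.
    GCDropoutLayer dropout;
    dropout.configure(&activations, &mask, &output, 0.5f, true);
    dropout.run();
}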
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h
new file mode 100644
index 000000000..a04e4002f
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCFILLBORDER_H__
+#define __ARM_COMPUTE_GCFILLBORDER_H__
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to run @ref GCFillBorderKernel */
+class GCFillBorder : public IGCSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in,out] tensor Source tensor. Data types supported: F16/F32
+ * @param[in] border_width The border width
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
+};
+}
+
+#endif /* __ARM_COMPUTE_GCFILLBORDER_H__ */
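A sketch: pad a one-pixel constant border, e.g. before running a 3x3 filter that reads outside the valid region (the tensor is assumed to be initialised and allocated):

#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h"

using namespace arm_compute;

void fill_border_example(GCTensor &tensor)
{
    GCFillBorder fill;
    fill.configure(&tensor, 1U, BorderMode::CONSTANT, PixelValue(0.f));
    fill.run();
}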
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
new file mode 100644
index 000000000..1ae5837de
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H__
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with OpenGL ES. This function calls the following kernels:
+ *
+ * -# @ref GCTransposeKernel
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class GCFullyConnectedLayerReshapeWeights : public IGCSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: F16/F32.
+ * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+};
+
+/** Basic function to compute a Fully Connected layer on OpenGL ES. This function calls the following OpenGL ES kernels:
+ *
+ * -# @ref GCIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref GCFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and @p transpose_weights is set to true) (called once)
+ * -# @ref GCGEMMMatrixMultiplyKernel
+ * -# @ref GCGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class GCFullyConnectedLayer : public IFunction
+{
+public:
+ /** Constructor */
+ GCFullyConnectedLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
+ * @param[in] biases Bias tensor. It can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] transpose_weights (Optional) Transpose the weights if true. Defaults to true.
+ * @param[in] are_weights_reshaped (Optional) Set to true if the weights have already been reshaped, so the reshape step is skipped. Defaults to false.
+ */
+ void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output);
+ void configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output);
+
+ GCIm2ColKernel _im2col_kernel;
+ GCFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+ GCGEMMMatrixMultiplyKernel _mm_kernel;
+ GCGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ GCTensor _im2col_output;
+ GCTensor _reshape_weights_output;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _accumulate_biases;
+};
+}
+#endif /* __ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H__ */
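A sketch of a small classifier head. The 2D orientation of the weights shown here is an assumption (it interacts with @p transpose_weights); shapes are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"

using namespace arm_compute;

void fully_connected_example()
{
    GCTensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));        // 256 input activations
    weights.allocator()->init(TensorInfo(TensorShape(10U, 256U), 1, DataType::F32)); // 2D weights (orientation assumed)
    biases.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    GCFullyConnectedLayer fc;
    fc.configure(&input, &weights, &biases, &output); // transpose_weights = true, are_weights_reshaped = false

    // ... allocate all tensors and upload weights/biases, then:
    fc.run();
}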
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
new file mode 100644
index 000000000..f2484cd80
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_GCGEMM_H__
+#define __ARM_COMPUTE_GCGEMM_H__
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to execute GEMM on OpenGL ES Compute. This function calls the following kernels:
+ *
+ * -# @ref GCGEMMInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref GCGEMMTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref GCGEMMMatrixMultiplyKernel
+ * -# @ref GCGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
+ *
+ */
+class GCGEMM : public IFunction
+{
+public:
+ /** Default constructor. */
+ GCGEMM();
+
+ /** Initialise the kernel's inputs and output
+ *
+ * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+ *
+ * @note All tensors must have the same data type.
+ *
+ * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
+ *
+ * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F32
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
+ * @param[out] output Output tensor. Data type supported: same as @p a
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of matrix C
+ */
+ void configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ GCGEMMInterleave4x4Kernel _interleave_kernel;
+ GCGEMMTranspose1xWKernel _transpose_kernel;
+ GCGEMMMatrixMultiplyKernel _mm_kernel;
+ GCGEMMMatrixAdditionKernel _ma_kernel;
+ GCTensor _tmp_a;
+ GCTensor _tmp_b;
+ bool _is_interleaved_transposed;
+ bool _run_addition;
+};
+}
+
+#endif /* __ARM_COMPUTE_GCGEMM_H__ */
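A sketch computing out = A * B with no C term (c = nullptr, so beta is irrelevant). Note that ACL TensorShape is (width, height), i.e. (columns, rows):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"

using namespace arm_compute;

void gemm_example()
{
    GCTensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // A is 32x64
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));   // B is 64x16
    out.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // out is 32x16

    GCGEMM gemm;
    gemm.configure(&a, &b, nullptr, &out, 1.0f, 0.0f);

    // ... allocate a/b/out and fill the inputs, then:
    gemm.run();
}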
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h
new file mode 100644
index 000000000..48fa7ed50
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H__
+#define __ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H__
+
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to execute GCGEMMInterleave4x4Kernel. This function calls the following OpenGL ES kernel:
+ *
+ * -# @ref GCGEMMInterleave4x4Kernel
+ *
+ */
+class GCGEMMInterleave4x4 : public IGCSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. Data types supported: F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h
new file mode 100644
index 000000000..24af2193c
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H__
+#define __ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H__
+
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute GCGEMMTranspose1xWKernel. This function calls the following OpenGL ES kernel:
+ *
+ * -# @ref GCGEMMTranspose1xWKernel
+ *
+ */
+class GCGEMMTranspose1xW : public IGCSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. Data type supported: F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h
new file mode 100644
index 000000000..d080a2f7b
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_GCNORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to compute a normalization layer. This function calls the following OpenGL ES kernels:
+ *
+ * -# @ref GCPixelWiseMultiplicationKernel
+ * -# @ref GCFillBorderKernel
+ * -# @ref GCNormalizationLayerKernel
+ *
+ */
+class GCNormalizationLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ GCNormalizationLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for a batch of inputs. Data types supported: F32. Number of channels must be 1.
+ * @param[out] output Destination tensor. Dimensions, data type and number of channels must match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ GCTensor _squared_input; /**< The intermediate buffer which stores the results of squaring the input */
+ GCNormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel to run */
+ GCPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel to run */
+ GCFillBorderKernel _border_handler; /**< Kernel to handle borders */
+};
+}
+#endif /* __ARM_COMPUTE_GCNORMALIZATIONLAYER_H__ */
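A sketch of an LRN-style cross-map normalization over 5 neighbouring feature maps, keeping the library defaults for the remaining NormalizationLayerInfo parameters (tensors assumed to be initialised and allocated as F32):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"

using namespace arm_compute;

void lrn_example(GCTensor &src, GCTensor &dst)
{
    GCNormalizationLayer norm;
    norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5));
    norm.run();
}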
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h
new file mode 100644
index 000000000..e6239edc2
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H__
+#define __ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to run @ref GCPixelWiseMultiplicationKernel. */
+class GCPixelWiseMultiplication : public IGCSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and scale.
+ *
+ * @param[in] input1 First tensor input. Data types supported: F32.
+ * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ * @param[in] scale Scale to apply after multiplication. Must be a positive value.
+ */
+ void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale);
+};
+}
+#endif /*__ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H__ */
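A sketch: with a scale of 1.0f this is a plain element-wise product of two equally shaped F32 tensors (assumed initialised and allocated):

#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h"

using namespace arm_compute;

void pixelwise_mul_example(GCTensor &a, GCTensor &b, GCTensor &out)
{
    GCPixelWiseMultiplication mul;
    mul.configure(&a, &b, &out, 1.0f); // out = a * b * scale
    mul.run();
}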
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
new file mode 100644
index 000000000..cce44d0c3
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCPOOLINGLAYER_H__
+#define __ARM_COMPUTE_GCPOOLINGLAYER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenGL ES kernels:
+ *
+ * -# @ref GCFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref GCPoolingLayerKernel
+ */
+class GCPoolingLayer : public IGCSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: F16/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GCPOOLINGLAYER_H__ */
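A sketch of 2x2 max pooling with stride 2 and no padding, which halves width and height so the border kernel is skipped (src/dst assumed to be initialised accordingly):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"

using namespace arm_compute;

void max_pool_example(GCTensor &src, GCTensor &dst)
{
    GCPoolingLayer pool;
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
    pool.run();
}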
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
new file mode 100644
index 000000000..e7f8d5053
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCSOFTMAXLAYER_H__
+#define __ARM_COMPUTE_GCSOFTMAXLAYER_H__
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to compute a SoftmaxLayer.
+ *
+ * Softmax is calculated by:
+ * @f[ out = exp(x - max(x)) / sum(exp(x - max(x))) @f]
+ *
+ * This function runs the following kernels:
+ * -# @ref GCLogits1DMaxKernel
+ * -# @ref GCLogits1DShiftExpSumKernel
+ * -# @ref GCLogits1DNormKernel
+ */
+class GCSoftmaxLayer : public IFunction
+{
+public:
+ /** Constructor */
+ GCSoftmaxLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[in] beta (Optional) A scaling factor for the exponent. Only beta = 1 is supported.
+ */
+ void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ GCLogits1DMaxKernel _max_kernel;
+ GCLogits1DShiftExpSumKernel _shift_exp_sum_kernel;
+ GCLogits1DNormKernel _norm_kernel;
+ GCTensor _max;
+ GCTensor _sum;
+ GCTensor _tmp;
+};
+}
+#endif /* __ARM_COMPUTE_GCSOFTMAXLAYER_H__ */
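The max-subtraction in the formula above is what the three kernels implement; callers only configure input and output. A sketch (tensors assumed to be initialised and allocated):

#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"

using namespace arm_compute;

void softmax_example(GCTensor &logits, GCTensor &probabilities)
{
    GCSoftmaxLayer softmax;
    softmax.configure(&logits, &probabilities); // beta defaults to 1.0f, the only supported value
    softmax.run();
}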
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h
new file mode 100644
index 000000000..23324343f
--- /dev/null
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GCTRANSPOSE_H__
+#define __ARM_COMPUTE_GCTRANSPOSE_H__
+
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Basic function to transpose a matrix on OpenGL ES. This function calls the following OpenGL ES kernel:
+ *
+ * -# @ref GCTransposeKernel
+ *
+ */
+class GCTranspose : public IGCSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Input tensor. Data types supported: F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ */
+ void configure(const IGCTensor *input, IGCTensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_GCTRANSPOSE_H__ */
diff --git a/arm_compute/runtime/ILifetimeManager.h b/arm_compute/runtime/ILifetimeManager.h
index 4f9af6f53..6f2c68d37 100644
--- a/arm_compute/runtime/ILifetimeManager.h
+++ b/arm_compute/runtime/ILifetimeManager.h
@@ -28,7 +28,7 @@
#include "arm_compute/runtime/Types.h"
#include <cstddef>
-#include <vector>
+#include <memory>
namespace arm_compute
{
@@ -58,6 +58,11 @@ public:
* @param[in] size Size of the given object at given time
*/
virtual void end_lifetime(void *obj, void **handle, size_t size) = 0;
+ /** Checks if the lifetime of the registered object is complete
+ *
+ * @return True if all object lifetimes are finalized else false.
+ */
+ virtual bool are_all_finalized() const = 0;
/** Creates a memory pool depending on the memory requirements
*
* @param allocator Allocator to use
@@ -65,16 +70,11 @@ public:
* @return A memory pool
*/
virtual std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) = 0;
- /** Checks if the lifetime of the registered object is complete
- *
- * @return True if all object lifetimes are finalized else false.
- */
- virtual bool are_all_finalized() const = 0;
/** Returns the type of mappings that the lifetime manager returns
*
* @return Mapping type of the lifetime manager
*/
virtual MappingType mapping_type() const = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_ILIFETIMEMANAGER_H__ */
diff --git a/arm_compute/runtime/ISimpleLifetimeManager.h b/arm_compute/runtime/ISimpleLifetimeManager.h
new file mode 100644
index 000000000..792ab0b55
--- /dev/null
+++ b/arm_compute/runtime/ISimpleLifetimeManager.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H__
+#define __ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H__
+
+#include "arm_compute/runtime/ILifetimeManager.h"
+
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/Types.h"
+
+#include <cstddef>
+#include <map>
+#include <vector>
+
+namespace arm_compute
+{
+class IAllocator;
+class IMemoryGroup;
+
+/** Abstract class of the simple lifetime manager interface */
+class ISimpleLifetimeManager : public ILifetimeManager
+{
+public:
+ /** Constructor */
+ ISimpleLifetimeManager();
+ /** Prevent instances of this class from being copy constructed */
+ ISimpleLifetimeManager(const ISimpleLifetimeManager &) = delete;
+ /** Prevent instances of this class from being copied */
+ ISimpleLifetimeManager &operator=(const ISimpleLifetimeManager &) = delete;
+ /** Allow instances of this class to be move constructed */
+ ISimpleLifetimeManager(ISimpleLifetimeManager &&) = default;
+ /** Allow instances of this class to be moved */
+ ISimpleLifetimeManager &operator=(ISimpleLifetimeManager &&) = default;
+
+ // Inherited methods overridden:
+ void register_group(IMemoryGroup *group) override;
+ void start_lifetime(void *obj) override;
+ void end_lifetime(void *obj, void **handle, size_t size) override;
+ bool are_all_finalized() const override;
+
+protected:
+ /** Update blobs and mappings */
+ virtual void update_blobs_and_mappings() = 0;
+
+protected:
+ /** Element struct */
+ struct Element
+ {
+ Element(void *id_ = nullptr, void **handle_ = nullptr, size_t size_ = 0, bool status_ = false)
+ : id(id_), handle(handle_), size(size_), status(status_)
+ {
+ }
+ void *id; /**< Element id */
+ void **handle; /**< Element's memory handle */
+ size_t size; /**< Element's size */
+ bool status; /**< Lifetime status */
+ };
+
+ IMemoryGroup *_active_group; /**< Active group */
+ std::vector<Element> _active_elements; /**< A list that contains the active elements */
+ std::map<IMemoryGroup *, std::vector<Element>> _finalized_groups; /**< A map that contains the finalized groups */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H__ */
diff --git a/arm_compute/runtime/Memory.h b/arm_compute/runtime/Memory.h
new file mode 100644
index 000000000..98bbb7023
--- /dev/null
+++ b/arm_compute/runtime/Memory.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MEMORY_H__
+#define __ARM_COMPUTE_MEMORY_H__
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+/** CPU implementation of memory object */
+class Memory
+{
+public:
+ /** Default Constructor */
+ Memory();
+ /** Constructor
+ *
+ * @note Ownership of the memory is transferred to this object
+ *
+ * @param[in] memory Memory to be imported
+ */
+ Memory(std::shared_ptr<uint8_t> memory);
+ /** Constructor
+ *
+ * @note Ownership of the memory is not transferred to this object.
+ * Thus management (allocate/free) should be done by the client.
+ *
+ * @param[in] memory Memory to be imported
+ */
+ Memory(uint8_t *memory);
+ /** Allow instances of this class to be copied */
+ Memory(const Memory &) = default;
+ /** Allow instances of this class to be copy assigned */
+ Memory &operator=(const Memory &) = default;
+ /** Allow instances of this class to be moved */
+ Memory(Memory &&) noexcept = default;
+ /** Allow instances of this class to be move assigned */
+ Memory &operator=(Memory &&) noexcept = default;
+
+ /** Returns the pointer to the allocated data.
+ *
+ * @return Pointer to the allocated data
+ */
+ uint8_t *buffer();
+ /** Returns the pointer to the allocated data.
+ *
+ * @return Pointer to the allocated data
+ */
+ uint8_t *buffer() const;
+ /** Handle of internal memory
+ *
+ * @return Handle of memory
+ */
+ uint8_t **handle();
+
+private:
+ uint8_t *_memory;
+ std::shared_ptr<uint8_t> _memory_owned;
+};
+}
+#endif /* __ARM_COMPUTE_MEMORY_H__ */
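The two import constructors differ only in ownership. A sketch of both; since the owning constructor takes a plain shared_ptr<uint8_t>, supplying the correct array deleter is the caller's responsibility:

#include "arm_compute/runtime/Memory.h"

#include <cstdint>
#include <memory>

using namespace arm_compute;

void memory_example()
{
    // Owning import: the buffer stays alive as long as this Memory (or any copy) does.
    auto heap = std::shared_ptr<uint8_t>(new uint8_t[1024], std::default_delete<uint8_t[]>());
    Memory owned(heap);

    // Non-owning import: the caller keeps the buffer valid and frees it afterwards.
    static uint8_t scratch[64];
    Memory imported(scratch);

    uint8_t *p = owned.buffer(); // raw pointer to the underlying storage
    (void)p;
}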
diff --git a/arm_compute/runtime/MemoryGroupBase.h b/arm_compute/runtime/MemoryGroupBase.h
index ab8acb349..19e983492 100644
--- a/arm_compute/runtime/MemoryGroupBase.h
+++ b/arm_compute/runtime/MemoryGroupBase.h
@@ -26,6 +26,7 @@
#include "arm_compute/runtime/IMemoryGroup.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IMemoryPool.h"
diff --git a/arm_compute/runtime/MultiImage.h b/arm_compute/runtime/MultiImage.h
index 917e586ef..30fa9b025 100644
--- a/arm_compute/runtime/MultiImage.h
+++ b/arm_compute/runtime/MultiImage.h
@@ -45,18 +45,18 @@ public:
MultiImage();
/** Allocate the multi-planar image
*
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
*/
void init(unsigned int width, unsigned int height, Format format);
/** Allocate the multi-planar image
*
* @note Uses conservative padding strategy which fits all kernels.
*
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
*/
void init_auto_padding(unsigned int width, unsigned int height, Format format);
/** Allocated a previously initialised multi image
@@ -67,10 +67,10 @@ public:
void allocate();
/** Create a subimage from an existing MultiImage.
*
- * @param[in] image Image to use backing memory from
- * @param[in] coords Starting coordinates of the new image. Should be within the parent image sizes
- * @param[in] width The width of the subimage
- * @param[in] height The height of the subimage
+ * @param[in] image Image to use backing memory from
+ * @param[in] coords Starting coordinates of the new image. Should be within the parent image sizes
+ * @param[in] width The width of the subimage
+ * @param[in] height The height of the subimage
*/
void create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height);
@@ -82,10 +82,10 @@ public:
private:
/** Init the multi-planar image
*
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
- * @param[in] auto_padding Specifies whether the image uses auto padding
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ * @param[in] auto_padding Specifies whether the image uses auto padding
*/
void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding);
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 40bff978a..08852cf36 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -39,11 +39,16 @@
#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
+#include "arm_compute/runtime/NEON/functions/NECol2Im.h"
#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
#include "arm_compute/runtime/NEON/functions/NEDilate.h"
@@ -57,7 +62,9 @@
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
@@ -68,8 +75,9 @@
#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
+#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
-#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
@@ -100,5 +108,6 @@
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
+#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
#endif /* __ARM_COMPUTE_NEFUNCTIONS_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index f3cd30591..007c53a0a 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -49,6 +49,16 @@ public:
* @param[in] activation_info Activation layer parameters.
*/
void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayer
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
+ * of the activation function. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] act_info Activation layer information.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
};
}
#endif /* __ARM_COMPUTE_NEACTIVATIONLAYER_H__ */
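The static validate() entry points added throughout this release let callers reject an unsupported configuration up front, without constructing or running a kernel. A sketch of the intended pattern (names illustrative):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"

using namespace arm_compute;

bool relu_is_supported(const ITensorInfo *input, const ITensorInfo *output)
{
    // Pure metadata check: no tensor memory is touched.
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);
    const Status status = NEActivationLayer::validate(input, output, act);
    return status.error_code() == ErrorCode::OK;
}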
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index 3d1862389..371807393 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -43,6 +43,16 @@ public:
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
};
}
#endif /*__ARM_COMPUTE_NEARITHMETICADDITION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index b59cca98a..751ed1adf 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -43,6 +43,16 @@ public:
* @param[in] policy Policy to use to handle overflow.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+ * @param[in] policy Policy to use to handle overflow.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
};
}
#endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
index 041b9e729..1933468af 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -50,14 +50,32 @@ public:
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
* 3 lower dimensions represent a single input with dimensions [width, height, FM].
* The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
* @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
* @param[in] epsilon Small value to avoid division with zero.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
*/
void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer
+ *
+ * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result.
+ * 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division with zero.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *mean, const ITensorInfo *var,
+ const ITensorInfo *beta, const ITensorInfo *gamma,
+ float epsilon);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEChannelExtract.h b/arm_compute/runtime/NEON/functions/NEChannelExtract.h
index 5e46eef3a..1620d3ad1 100644
--- a/arm_compute/runtime/NEON/functions/NEChannelExtract.h
+++ b/arm_compute/runtime/NEON/functions/NEChannelExtract.h
@@ -39,14 +39,14 @@ class NEChannelExtract : public INESimpleFunction
public:
/** Initialize the function's source, destination
*
- * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar.
+ * @param[in] input The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
* @param[in] channel The channel to extract.
* @param[out] output The extracted channel. Format supported: U8
*/
void configure(const ITensor *input, Channel channel, ITensor *output);
/** Initialize the function's source, destination
*
- * @param[in] input The multi-planar input image to extract channel from.
+ * @param[in] input The multi-planar input image to extract the channel from. Formats supported: NV12/NV21/IYUV/YUV444
* @param[in] channel The channel to extract.
* @param[out] output The extracted channel. Format supported: U8
*/
diff --git a/arm_compute/runtime/NEON/functions/NECol2Im.h b/arm_compute/runtime/NEON/functions/NECol2Im.h
new file mode 100644
index 000000000..9b05bd451
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NECol2Im.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOL2IM_H__
+#define __ARM_COMPUTE_NECOL2IM_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NECol2Im */
+class NECol2Im : public INESimpleFunction
+{
+public:
+ /** Configure the col2im NEON kernel
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+ * while the rest represent batch of outputs. Data types supported: Same as @p input
+ * @param[in] convolved_dims Output convolved dimensions.
+ */
+ void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
+ /** Static function to check if given info will lead to a valid configuration of @ref NECol2Im
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+ * while the rest represent batch of outputs. Data types supported: Same as @p input
+ * @param[in] convolved_dims Output convolved dimensions.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims);
+};
+}
+#endif /* __ARM_COMPUTE_NECOL2IM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEColorConvert.h b/arm_compute/runtime/NEON/functions/NEColorConvert.h
index 2997778ed..ab0bf1460 100644
--- a/arm_compute/runtime/NEON/functions/NEColorConvert.h
+++ b/arm_compute/runtime/NEON/functions/NEColorConvert.h
@@ -38,26 +38,27 @@ class NEColorConvert : public INESimpleFunction
public:
/** Initialize the function's source, destination
*
- * @param[in] input The input single-planar tensor from which to convert
- * @param[in] output The converted single-planar output tensor
+ * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+ * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+ * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888)
*/
void configure(const ITensor *input, ITensor *output);
/** Initialize the function's source, destination
*
- * @param[in] input The multi-planar input image from which to convert
- * @param[in] output The converted single-planar output image
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
*/
void configure(const IMultiImage *input, IImage *output);
/** Initialize the function's source, destination
*
- * @param[in] input The single-planar input image from which to convert
- * @param[in] output The converted multi-planar output image
+ * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+ * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
*/
void configure(const IImage *input, IMultiImage *output);
/** Initialize the function's source, destination
*
- * @param[in] input The multi-planar input image from which to convert
- * @param[in] output The converted multi-planar output image
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
*/
void configure(const IMultiImage *input, IMultiImage *output);
};
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
new file mode 100644
index 000000000..8757bc63a
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__
+
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Function to run the deconvolution layer.
+ *
+ * The operation is similar to convolution, but it is implemented by up-sampling the input (inserting zeros
+ * between the input samples) and then convolving the kernels over the up-sampled result.
+ *
+ * Before the deconvolution is performed, the first two dimensions are up-scaled with zeros. The relation between
+ * input and output is as follows:
+ *     width_output  = round((width_input - 1) * upscale_x - 2 * padding_x + kernel_x + a_x)
+ *     height_output = round((height_input - 1) * upscale_y - 2 * padding_y + kernel_y + a_y)
+ *
+ * where
+ *     width is the size of the first input dimension.
+ *     height is the size of the second input dimension.
+ *     width_output is the size of the first output dimension.
+ *     height_output is the size of the second output dimension.
+ *     kernel_x and kernel_y are the convolution sizes in x and y.
+ *     a_x and a_y are the number of zeros added to the top and right edges of the input.
+ *     upscale_x and upscale_y are the upscaling factors for the X and Y axes.
+ *
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEDeconvolutionLayerUpsampleKernel
+ * -# @ref NEDirectConvolutionLayer
+ *
+ */
+class NEDeconvolutionLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, OFM, IFM]. Data type supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, as described in @ref PadStrideInfo.
+ * @param[in] ax The number of zeros added to the right edge of the input.
+ * @param[in] ay The number of zeros added to the top edge of the input.
+ * @param[in] upscalex How much to scale the X axis.
+ * @param[in] upscaley How much to scale the Y axis.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
+ unsigned int ax, unsigned int ay, float upscalex, float upscaley);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEDeconvolutionLayerUpsample _scale_f;
+ NEDirectConvolutionLayer _conv_f;
+ Tensor _scaled_output;
+};
+} // arm_compute
+#endif /* __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ */
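
To make the output-size relation above concrete, here is a small standalone sketch with illustrative values only (a 4x4 input, a 3x3 kernel, upscale factor 2, no padding and no extra zeros); the same arithmetic applies to the height:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int width_input = 4, kernel_x = 3, padding_x = 0, a_x = 0, upscale_x = 2;
        const int width_output = static_cast<int>(
            std::lround((width_input - 1) * upscale_x - 2 * padding_x + kernel_x + a_x));
        std::printf("width_output = %d\n", width_output); // (4 - 1) * 2 - 0 + 3 + 0 = 9
        return 0;
    }
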
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h
new file mode 100644
index 000000000..d2ac12a58
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDECONVOLUTIONUPSAMPLE_H__
+#define __ARM_COMPUTE_NEDECONVOLUTIONUPSAMPLE_H__
+
+#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEDeconvolutionLayerUpsampleKernel */
+class NEDeconvolutionLayerUpsample : public IFunction
+{
+public:
+ /** Constructor
+ *
+ * Initialize NEDeconvolutionLayerUpsample
+ */
+ NEDeconvolutionLayerUpsample(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Initialize the function's source, destination, interpolation type and border_mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: F32.
+ * @param[out] output Destination tensor. Data type supported: F32.
+ * @param[in] a Top and right inner border sizes. These rows and columns will be filled with zero.
+ * @param[in] iz The number of zeros to be inserted between input samples
+ * @param[in] info Contains padding and policies to be used in the deconvolution, as described in @ref PadStrideInfo.
+ */
+ void configure(ITensor *input, ITensor *output, const std::pair<unsigned int, unsigned int> &a,
+ const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ Tensor _offsets;
+ NEFillBorderKernel _border_handler;
+ NEDeconvolutionLayerUpsampleKernel _upsample;
+};
+} // arm_compute
+#endif /*__ARM_COMPUTE_NEDECONVOLUTIONUPSAMPLE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h
index cc6509957..5b63b7063 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h
@@ -26,7 +26,7 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include <memory>
@@ -39,14 +39,14 @@ class ITensor;
/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
*
* -# @ref NEFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
- * -# @ref NEDepthConcatenateKernel
+ * -# @ref NEDepthConcatenateLayerKernel
*
*/
-class NEDepthConcatenate : public IFunction
+class NEDepthConcatenateLayer : public IFunction
{
public:
/** Default constructor */
- NEDepthConcatenate();
+ NEDepthConcatenateLayer();
/** Initialise the kernel's inputs vector and output.
*
* @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32.
@@ -58,10 +58,10 @@ public:
void run() override;
private:
- std::vector<ITensor *> _inputs_vector;
- std::unique_ptr<NEDepthConcatenateKernel[]> _concat_kernels_vector;
- std::unique_ptr<NEFillBorderKernel[]> _border_handlers_vector;
- unsigned int _num_inputs;
+ std::vector<ITensor *> _inputs_vector;
+ std::unique_ptr<NEDepthConcatenateLayerKernel[]> _concat_kernels_vector;
+ std::unique_ptr<NEFillBorderKernel[]> _border_handlers_vector;
+ unsigned int _num_inputs;
};
}
#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
index 37f7293fb..b235e87b4 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvert.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
@@ -33,16 +33,16 @@ namespace arm_compute
{
class ITensor;
-/**Basic function to run @ref NEDepthConvertKernel */
-class NEDepthConvert : public INESimpleFunction
+/** Basic function to run @ref NEDepthConvertLayerKernel */
+class NEDepthConvertLayer : public INESimpleFunction
{
public:
/* Constructor */
- NEDepthConvert() = default;
+ NEDepthConvertLayer() = default;
/** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEDepthConvert(const NEDepthConvert &) = delete;
+ NEDepthConvertLayer(const NEDepthConvertLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers)*/
- const NEDepthConvert &operator=(const NEDepthConvert &) = delete;
+ const NEDepthConvertLayer &operator=(const NEDepthConvertLayer &) = delete;
/** Initialize the function's source, destination
*
* Valid conversions Input -> Output :
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
new file mode 100644
index 000000000..659594fe1
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__
+#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__
+
+#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following NEON kernels:
+ *
+ * -# @ref NEDepthwiseConvolutionLayer3x3Kernel
+ * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0)
+ *
+ */
+class NEDepthwiseConvolutionLayer3x3 : public IFunction
+{
+public:
+ /** Default constructor */
+ NEDepthwiseConvolutionLayer3x3();
+ /** Initialize the function's source, destination, kernels and border_size.
+ *
+ * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEDepthwiseConvolutionLayer3x3Kernel _kernel;
+ NEDirectConvolutionLayerBiasAccumulateKernel _bias_kernel;
+ NEFillBorderKernel _border_handler;
+ bool _has_bias;
+};
+
+/** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels:
+ *
+ * -# @ref NEDepthwiseIm2ColKernel
+ * -# @ref NEDepthwiseWeightsReshapeKernel
+ * -# @ref NEGEMMMatrixVectorMultiplyKernel
+ * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0)
+ *
+ */
+class NEDepthwiseConvolutionLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NEDepthwiseConvolutionLayer();
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling).
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p input.
+ * @param[in] conv_info Padding and stride information to use for the convolution.
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEDepthwiseIm2ColKernel _im2col_kernel;
+ NEDepthwiseWeightsReshapeKernel _weights_reshape_kernel;
+ NEGEMMMatrixVectorMultiplyKernel _v2mm_kernel;
+ NEDepthwiseVectorToTensorKernel _vector_to_tensor_kernel;
+ Tensor _input_reshaped;
+ Tensor _weights_reshaped;
+ Tensor _v2mm_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ */ \ No newline at end of file
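
For reference, the computation that the generic im2col/GEMM pipeline above implements is an independent 2D convolution per input channel. A scalar sketch of that per-channel convolution (unit stride, no padding, channel-major planes; the layout is chosen here for illustration only):

    #include <vector>

    // Each input channel is convolved with its own kW x kH filter plane.
    std::vector<float> depthwise_conv_reference(const std::vector<float> &in, int W, int H, int C,
                                                const std::vector<float> &weights, int kW, int kH)
    {
        const int oW = W - kW + 1;
        const int oH = H - kH + 1;
        std::vector<float> out(oW * oH * C, 0.f);
        for (int c = 0; c < C; ++c)
        {
            for (int y = 0; y < oH; ++y)
            {
                for (int x = 0; x < oW; ++x)
                {
                    float acc = 0.f;
                    for (int ky = 0; ky < kH; ++ky)
                    {
                        for (int kx = 0; kx < kW; ++kx)
                        {
                            acc += in[(c * H + y + ky) * W + (x + kx)] * weights[(c * kH + ky) * kW + kx];
                        }
                    }
                    out[(c * oH + y) * oW + x] = acc;
                }
            }
        }
        return out;
    }
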
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
new file mode 100644
index 000000000..0562c0751
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_DEPTHWISE_SEPARABLE_CONVOLUTION_H__
+#define __ARM_COMPUTE_NEON_DEPTHWISE_SEPARABLE_CONVOLUTION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute depthwise convolution. This function calls the following NEON kernels and function:
+ *
+ * -# @ref NEDepthwiseConvolutionLayer
+ * -# @ref NEDirectConvolutionLayer
+ *
+ */
+class NEDepthwiseSeparableConvolutionLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NEDepthwiseSeparableConvolutionLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32.
+ * @param[in] depthwise_weights Depthwise convolution weights tensor. These are 3D tensors with dimensions [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+ * @param[in] depthwise_biases (Optional) Biases tensor. Biases are a 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p depthwise_weights.
+ * @param[out] depthwise_out Depthwise destination tensor.
+ * @param[in] pointwise_weights Pointwise convolution weights tensor. These are 4D tensors with dimensions [1, 1, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] pointwise_biases (Optional) Biases tensor. Biases are a 1D tensor with dimensions [OFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p pointwise_weights.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] depthwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for depthwise convolution.
+ * @param[in] pointwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for pointwise convolution.
+ */
+ void configure(ITensor *input, const ITensor *depthwise_weights, const ITensor *depthwise_biases, ITensor *depthwise_out,
+ const ITensor *pointwise_weights, const ITensor *pointwise_biases, ITensor *output,
+ const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEDepthwiseConvolutionLayer _depthwise_conv;
+ NEDirectConvolutionLayer _pointwise_conv;
+};
+}
+#endif /*__ARM_COMPUTE_NEON_DEPTHWISE_SEPARABLE_CONVOLUTION_H__ */
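
A minimal wiring sketch for the two-stage function above; the tensor names are hypothetical and every tensor is assumed to be an already-allocated F32 tensor of a compatible shape:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_separable(Tensor &input, const Tensor &dw_weights, const Tensor &dw_biases, Tensor &dw_out,
                       const Tensor &pw_weights, const Tensor &pw_biases, Tensor &output)
    {
        NEDepthwiseSeparableConvolutionLayer conv;
        conv.configure(&input, &dw_weights, &dw_biases, &dw_out,
                       &pw_weights, &pw_biases, &output,
                       PadStrideInfo(1, 1, 1, 1),  // depthwise: stride 1, pad 1
                       PadStrideInfo(1, 1, 0, 0)); // pointwise 1x1: stride 1, no pad
        conv.run();
    }
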
diff --git a/arm_compute/runtime/NEON/functions/NEDilate.h b/arm_compute/runtime/NEON/functions/NEDilate.h
index 17bdb3363..3b795f57e 100644
--- a/arm_compute/runtime/NEON/functions/NEDilate.h
+++ b/arm_compute/runtime/NEON/functions/NEDilate.h
@@ -49,7 +49,7 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
};
}
#endif /*__ARM_COMPUTE_NEDILATE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index daaf18f29..09a54968b 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -51,23 +51,43 @@ public:
/** Constructor */
NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input, weights, biases and output tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/QS16/F16/F32
- * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/F16/F32
- * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32
- *
- * @param[in, out] input Input tensor. Data types supported: QS8/QS16/F16/F32.
- * @param[in] weights Set of kernels to convolve the input volume.
- * Supported sizes: 1x1, 3x3 and 5x5.
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] bias Set of biases. Data type supported: Same as @p input.
- * @param[out] output Output tensor.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
+ *
+ * @note: DirectConvolution only works in the following configurations:
+ * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/QS16/F16/F32
+ * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/F16/F32
+ * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32
+ *
+ * @param[in, out] input Input tensor. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] weights Set of kernels to convolve the input volume.
+ * Supported sizes: 1x1, 3x3 and 5x5.
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Output tensor.
+ * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer
+ *
+ * @note: DirectConvolution only works in the following configurations:
+ * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/QS16/F16/F32
+ * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/F16/F32
+ * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32
+ *
+ * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] weights Set of kernels to convolve the input volume.
+ * Supported sizes: 1x1, 3x3 and 5x5.
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p input.
+ * @param[in] output Output tensor.
+ * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info);
// Inherited methods overridden:
void run() override;
@@ -78,6 +98,7 @@ private:
NEDirectConvolutionLayerKernel _conv_kernel;
NEFillBorderKernel _input_border_handler;
Tensor _accumulator;
+ bool _has_bias;
};
}
#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEErode.h b/arm_compute/runtime/NEON/functions/NEErode.h
index 940ae1847..739e981a9 100644
--- a/arm_compute/runtime/NEON/functions/NEErode.h
+++ b/arm_compute/runtime/NEON/functions/NEErode.h
@@ -49,7 +49,7 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
};
}
#endif /*__ARM_COMPUTE_NEERODE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
deleted file mode 100644
index 0b0a7742f..000000000
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWP_H__
-#define __ARM_COMPUTE_NEGEMMLOWP_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute GEMMLowp on NEON. This function calls the following NEON kernels:
-*
-* -# @ref NEGEMMInterleave4x4Kernel
-* -# @ref NEGEMMTranspose1xWKernel
-* -# @ref NEGEMMLowpMatrixMultiplyKernel
-*
-*/
-class NEGEMMLowp : public IFunction
-{
-public:
- /** Constructor */
- NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the kernel's inputs, output
- *
- * @note GEMM_LOWP: low precision GEMM kernel
- * This kernel performs the following computation:
- *
- * -# Convert a values from uint8 to int32 and add a_offset to each of them.
- * -# Convert b values from uint8 to int32 and add b_offset to each of them.
- * -# Compute the int32 matrix product of the resulting a * b.
- * -# Add output_offset to each entry of the result.
- * -# Multiply each entry of the result and round to the nearest integer
- * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
- *
- * @param[in] a First input tensor (Matrix A). Data type supported: U8.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[out] output Output tensor. Data type supported: same as @p a.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_offset Offset to be added to each element of the output matrix
- * @param[in] output_mult_int Value to be multiplied to each element of the output matrix
- * @param[in] shift Number of bits to shift right the result.
- */
- void configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- NEGEMMInterleave4x4Kernel _interleave_kernel;
- NEGEMMTranspose1xWKernel _transpose_kernel;
- NEGEMMLowpMatrixMultiplyKernel _mm_kernel;
- Tensor _tmp_a;
- Tensor _tmp_b;
-};
-}
-#endif /*__ARM_COMPUTE_NEGEMMLOWP_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
new file mode 100644
index 000000000..3d213a766
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
@@ -0,0 +1,69 @@
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__
+#define __ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute matrix multiply assembly kernels.
+ *
+ */
+class NEGEMMLowpAssemblyMatrixMultiplyCore : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported: U8, S8.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[out] output Output tensor. Data type supported: S32
+ */
+ void configure(const ITensor *a, const ITensor *b, ITensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::unique_ptr<INEKernel> _mm_kernel;
+ std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
+ std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _workspace;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
new file mode 100644
index 000000000..46e6b494f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__
+#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following NEON kernels if the DOT product instruction is not available:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel
+ * -# @ref NEGEMMTranspose1xWKernel
+ * -# @ref NEGEMMLowpMatrixMultiplyKernel
+ * -# @ref NEGEMMLowpOffsetContributionKernel
+ *
+ * otherwise if the DOT product instruction is available:
+ *
+ * -# @ref NEGEMMInterleaveBlockedKernel
+ * -# @ref NEGEMMLowpAArch64V8P4Kernel
+ * -# @ref NEGEMMLowpOffsetContributionKernel
+ *
+*/
+class NEGEMMLowpMatrixMultiplyCore : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Initialise the kernel's inputs, output
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ * This kernel performs the following computations:
+ *
+ * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+ * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[out] output Output tensor. Data type supported: S32
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[in] output Output tensor. Data type supported: S32
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should be executed only for the first run
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ std::unique_ptr<INEKernel> _mm_kernel;
+ std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
+ std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
+ NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+ NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+ NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
+ Tensor _vector_sum_col;
+ Tensor _vector_sum_row;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _workspace;
+ int32_t _a_offset;
+ int32_t _b_offset;
+ bool _run_vector_matrix_multiplication;
+ bool _dot_product_path;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
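
Mathematically, the three steps listed above amount to result[i][j] = sum_k (a[i][k] + a_offset) * (b[k][j] + b_offset), accumulated in int32. A scalar reference sketch of that computation (row-major layouts; the offsets are passed explicitly here for illustration):

    #include <cstdint>
    #include <vector>

    // result[i][j] = sum_k (a[i][k] + a_offset) * (b[k][j] + b_offset), in int32.
    // a is M x K, b is K x N, both row-major.
    std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                            int M, int N, int K, int32_t a_offset, int32_t b_offset)
    {
        std::vector<int32_t> out(M * N);
        for (int i = 0; i < M; ++i)
        {
            for (int j = 0; j < N; ++j)
            {
                int32_t acc = 0;
                for (int k = 0; k < K; ++k)
                {
                    acc += (int32_t(a[i * K + k]) + a_offset) * (int32_t(b[k * N + j]) + b_offset);
                }
                out[i * N + j] = acc;
            }
        }
        return out;
    }
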
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
new file mode 100644
index 000000000..7da0d2359
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__
+#define __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+/** This file contains all available output stages for GEMMLowp on NEON.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final ASYMM8 value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8Scale on NEON.
+ *
+ * NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
+ * The final result is:
+ *
+ * ((input[i][k] + result_offset) * result_mult_int) >> result_shift
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
+ *
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @note The function also accepts two optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ * after the result is shifted right by result_shift
+*/
+class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. It is the output of the @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_offset Offset to be added to each element of the input matrix
+ * @param[in] result_mult_int Value to multiply each element of the input matrix by, once the result_offset has been added
+ * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale
+ *
+ * @param[in] input Input tensor. It is the output of the @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+};
+
+/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on NEON.
+ *
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
+ *
+ * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
+ *
+ * The final result is:
+ *
+ * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ * (x * y) / 2^31
+ *
+ * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
+ *
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @note The function also accepts two optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ * after the result is shifted right by result_shift
+*/
+class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed point value to multiply each element of the input matrix by (after the bias, if provided, has been added)
+ * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+ *
+ * @param[in] input Input tensor. It is the output of the @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
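
To make the quantize-down arithmetic documented above concrete, here is a minimal scalar sketch (not the NEON kernel itself, which vectorizes this). It rounds half up and uses a plain arithmetic right shift; gemmlowp's SaturatingRoundingDoublingHighMul handles ties and the x == y == INT32_MIN edge case slightly differently.

    #include <algorithm>
    #include <cstdint>

    // FixedPointMul(x, y): nearest integer to (x * y) / 2^31, computed with a
    // 64-bit intermediate so there is no overflow or intermediate rounding.
    int32_t fixed_point_mul(int32_t x, int32_t y)
    {
        const int64_t prod = static_cast<int64_t>(x) * static_cast<int64_t>(y);
        return static_cast<int32_t>((prod + (1LL << 30)) >> 31); // round half up
    }

    // One output element of the ScaleByFixedPoint stage, including the optional
    // bias addition and the final saturation to the QASYMM8 range [0, 255].
    uint8_t quantize_down(int32_t acc, int32_t bias, int32_t result_fixedpoint_multiplier,
                          int32_t result_shift, int32_t result_offset_after_shift)
    {
        int32_t v = fixed_point_mul(acc + bias, result_fixedpoint_multiplier);
        v         = (v >> result_shift) + result_offset_after_shift;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }
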
diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
index b4ed56a0c..dbe0ecdf6 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
@@ -91,7 +91,8 @@ public:
void run() override;
private:
- std::unique_ptr<NEFillBorderKernel[]> _border_handler;
+ std::unique_ptr<NEFillBorderKernel[]> _horizontal_border_handler;
+ std::unique_ptr<NEFillBorderKernel[]> _vertical_border_handler;
std::unique_ptr<NEGaussianPyramidHorKernel[]> _horizontal_reduction;
std::unique_ptr<NEGaussianPyramidVertKernel[]> _vertical_reduction;
};
diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h
new file mode 100644
index 000000000..cb08f5cd0
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEIm2Col.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEIM2COL_H__
+#define __ARM_COMPUTE_NEIM2COL_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEIm2ColKernel */
+class NEIm2Col : public INESimpleFunction
+{
+public:
+ /** Configure the im2col NEON kernel
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * Note: QASYMM8 works only for has_bias = false
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] kernel_dims The kernel dimensions (width and height).
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias In case biases are provided, expands the output matrix with an extra row of 1s so the bias can be folded into the GEMM.
+ */
+ void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEIm2Col
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+ * Note: QASYMM8 works only for has_bias = false
+ * @param[in] output The output tensor. Data types supported: Same as @p input
+ * @param[in] kernel_dims The kernel dimensions (width and height).
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias In case biases are provided, expands the output matrix with an extra row of 1s so the bias can be folded into the GEMM.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+};
+}
+#endif /* __ARM_COMPUTE_NEIM2COL_H__ */
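
A hypothetical configuration of the new NEIm2Col function, assuming the caller has already initialised and allocated both tensors; the 3x3 kernel, unit stride and 1-pixel padding are illustrative values only.

    #include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Unfold 3x3 patches of `src` into rows of `dst` for a GEMM-based convolution.
    void run_im2col(Tensor &src, Tensor &dst)
    {
        NEIm2Col im2col;
        im2col.configure(&src, &dst, Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1),
                         false /* has_bias: must be false for QASYMM8 */);
        im2col.run();
    }
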
diff --git a/arm_compute/runtime/NEON/functions/NEIntegralImage.h b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
index 6d7dd697e..1ac501c99 100644
--- a/arm_compute/runtime/NEON/functions/NEIntegralImage.h
+++ b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
@@ -35,10 +35,10 @@ class NEIntegralImage : public INESimpleFunction
{
public:
/** Initialise the function's source, destinations and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data type supported: U32.
- */
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: U32.
+ */
void configure(const ITensor *input, ITensor *output);
};
}
diff --git a/arm_compute/runtime/NEON/functions/NEL2Normalize.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index 95d5186c1..100e23940 100644
--- a/arm_compute/runtime/NEON/functions/NEL2Normalize.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -24,7 +24,7 @@
#ifndef __ARM_COMPUTE_NEL2NORMALIZE_H__
#define __ARM_COMPUTE_NEL2NORMALIZE_H__
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -41,13 +41,13 @@ class ITensor;
*
* This function runs the following kernels:
* -# @ref NEReductionOperation
- * -# @ref NEL2NormalizeKernel
+ * -# @ref NEL2NormalizeLayerKernel
*/
-class NEL2Normalize : public IFunction
+class NEL2NormalizeLayer : public IFunction
{
public:
/** Constructor */
- NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
* @param[in, out] input Source tensor. Data types supported: F32. (Written to only for border_size != 0)
@@ -61,10 +61,10 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- NEReductionOperation _reduce_func;
- NEL2NormalizeKernel _normalize_kernel;
- Tensor _sumsq;
+ MemoryGroup _memory_group;
+ NEReductionOperation _reduce_func;
+ NEL2NormalizeLayerKernel _normalize_kernel;
+ Tensor _sumsq;
};
}
#endif /* __ARM_COMPUTE_NEL2NORMALIZE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
index 991ae7c29..baa4b7b1a 100644
--- a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
+++ b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
@@ -27,7 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/runtime/Pyramid.h"
@@ -79,7 +79,7 @@ private:
std::unique_ptr<NEArithmeticSubtraction[]> _subf;
Pyramid _gauss_pyr;
Pyramid _conv_pyr;
- NEDepthConvert _depth_function;
+ NEDepthConvertLayer _depth_function;
};
}
#endif /*__ARM_COMPUTE_NELAPLACIANPYRAMID_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
index 413973349..3d423607a 100644
--- a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
+++ b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
@@ -27,7 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/runtime/NEON/functions/NEScale.h"
#include "arm_compute/runtime/Pyramid.h"
@@ -43,7 +43,7 @@ using IImage = ITensor;
*
* -# @ref NEArithmeticAddition
* -# @ref NEScale
- * -# @ref NEDepthConvert
+ * -# @ref NEDepthConvertLayer
*
* This function reconstructs the original image from a Laplacian Image Pyramid.
*
@@ -85,7 +85,7 @@ private:
Pyramid _tmp_pyr;
std::unique_ptr<NEArithmeticAddition[]> _addf;
std::unique_ptr<NEScale[]> _scalef;
- NEDepthConvert _depthf;
+ NEDepthConvertLayer _depthf;
};
}
#endif /*__ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEMagnitude.h b/arm_compute/runtime/NEON/functions/NEMagnitude.h
index 6c1f988ef..5bc3faf66 100644
--- a/arm_compute/runtime/NEON/functions/NEMagnitude.h
+++ b/arm_compute/runtime/NEON/functions/NEMagnitude.h
@@ -39,9 +39,10 @@ public:
* @param[in] input1 First tensor input. Data type supported: S16.
* @param[in] input2 Second tensor input. Data type supported: S16.
* @param[out] output Output tensor. Data type supported: S16.
+ * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
 * @param[in] use_fp16 (Optional) If true, FP16 kernels are used; if false, F32 kernels are used.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16 = false);
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM, bool use_fp16 = false);
};
}
#endif /*__ARM_COMPUTE_NEMAGNITUDE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index 1c95c5bc4..4b5ad2870 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -40,7 +40,7 @@ namespace arm_compute
{
class ITensor;
-/** Basic function to simulate a normalization layer. This function calls the following NEON kernels:
+/** Basic function to compute a normalization layer. This function calls the following NEON kernels:
*
* -# @ref NEPixelWiseMultiplicationKernel
* -# @ref NEFillBorderKernel
@@ -55,11 +55,21 @@ public:
/** Set the input and output tensors.
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data type supported: QS8/F16/F32
+ * and an optional 4th dimension for batch of inputs. Data type supported: QS8/QS16/F16/F32
* @param[out] output Destination with the same dimensions, data type and number of channels of @p input
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info);
+ void configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayer
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data type supported: QS8/QS16/F16/F32
+ * @param[in] output Destination with the same dimensions, data type and number of channels of @p input
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEPhase.h b/arm_compute/runtime/NEON/functions/NEPhase.h
index 985ba84c4..cd62cf98e 100644
--- a/arm_compute/runtime/NEON/functions/NEPhase.h
+++ b/arm_compute/runtime/NEON/functions/NEPhase.h
@@ -36,11 +36,12 @@ class NEPhase : public INESimpleFunction
public:
/** Initialise the kernel's inputs, output.
*
- * @param[in] input1 First tensor input. Data type supported: S16.
- * @param[in] input2 Second tensor input. Data type supported: S16.
- * @param[out] output Output tensor. Data type supported: U8.
+ * @param[in] input1 First tensor input. Data type supported: S16.
+ * @param[in] input2 Second tensor input. Data type supported: S16.
+ * @param[out] output Output tensor. Data type supported: U8.
+ * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type = PhaseType::SIGNED);
};
}
#endif /*__ARM_COMPUTE_NEPHASE_H__ */
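
Both NEMagnitude and NEPhase gain an optional type parameter above. A short sketch combining them on Sobel-style S16 gradients, passing the documented defaults explicitly; tensor setup is assumed to have happened elsewhere.

    #include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
    #include "arm_compute/runtime/NEON/functions/NEPhase.h"

    using namespace arm_compute;

    // gx/gy: S16 gradients; mag: S16 output; phase: U8 output.
    void gradients_to_polar(const ITensor *gx, const ITensor *gy, ITensor *mag, ITensor *phase)
    {
        NEMagnitude magnitude;
        NEPhase     phase_fn;
        magnitude.configure(gx, gy, mag, MagnitudeType::L2NORM, false /* use_fp16 */);
        phase_fn.configure(gx, gy, phase, PhaseType::SIGNED);
        magnitude.run();
        phase_fn.run();
    }
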
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index de7a797cd..7d22500c5 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -45,6 +45,18 @@ public:
* @param[in] rounding_policy Rounding policy.
*/
void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
+ *
+ * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F32.
+ * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8/QS8/S16/F32.
+ * @param[in] scale Scale to apply after multiplication. Must be positive.
+ * @param[in] overflow_policy Overflow policy.
+ * @param[in] rounding_policy Rounding policy.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
};
}
#endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__ */
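
The static validate() entry points added throughout this release let callers reject a configuration before allocating anything. A sketch of the intended validate-then-configure pattern (tensor setup assumed elsewhere):

    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    bool try_configure_mul(NEPixelWiseMultiplication &mul, Tensor &src0, Tensor &src1, Tensor &dst)
    {
        const Status s = NEPixelWiseMultiplication::validate(src0.info(), src1.info(), dst.info(),
                                                             1.f, ConvertPolicy::SATURATE,
                                                             RoundingPolicy::TO_ZERO);
        if(!bool(s))
        {
            return false; // invalid combination; the Status carries the error description
        }
        mul.configure(&src0, &src1, &dst, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
        return true;
    }
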
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 7b038aaa5..0f8abb587 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -53,6 +53,17 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
void configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayer
+ *
+ * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
+ *
+ * @param[in] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32.
+ * @param[in] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 5adc1110d..69a90dd89 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -35,7 +35,7 @@ class ITensor;
/** Basic function to run @ref NEROIPoolingLayerKernel.
*
- * This function calls the following OpenCL kernels:
+ * This function calls the following NEON kernels:
* -# @ref NEROIPoolingLayerKernel
*
*/
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 7297880a7..1d96db3ff 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -52,8 +52,10 @@ public:
* @param[in] policy The interpolation type.
* @param[in] border_mode Strategy to use for borders.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
*/
- void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue());
+ void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
+ SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
// Inherited methods overridden:
void run() override;
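
A sketch of the extended NEScale::configure() above, passing the new sampling policy explicitly alongside a bilinear resize; the border handling shown is illustrative.

    #include "arm_compute/runtime/NEON/functions/NEScale.h"

    using namespace arm_compute;

    // Bilinear resize sampling at pixel centres, replicating border pixels.
    void resize(ITensor *src, ITensor *dst)
    {
        NEScale scale;
        scale.configure(src, dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE,
                        PixelValue(), SamplingPolicy::CENTER);
        scale.run();
    }
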
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index a265f7004..5043f79c2 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -53,8 +53,18 @@ public:
*
* @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
* @param[out] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
*/
- void configure(ITensor *input, ITensor *output);
+ void configure(ITensor *input, ITensor *output, float beta = 1.0f);
+ /** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] output Destination tensor. Data types supported: same as @p input
+ * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f);
// Inherited methods overridden:
void run() override;
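
A sketch of the new beta parameter on NESoftmaxLayer; beta acts as a temperature-style scale on the exponent, and values other than 1 require an F16/F32 input as noted above. The 0.5f value is illustrative.

    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"

    using namespace arm_compute;

    void run_softmax(ITensor *src, ITensor *dst)
    {
        NESoftmaxLayer softmax; // default-constructed: uses its own memory group
        softmax.configure(src, dst, 0.5f /* beta: exponent scaling */);
        softmax.run();
    }
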
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 4b606e728..6d1e10708 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -41,10 +41,18 @@ class NETranspose : public INESimpleFunction
public:
/** Initialise the kernel's inputs and output
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NETranspose
+ *
+ * @param[in] input The input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] output The output tensor. Data types supported: Same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};
}
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
new file mode 100644
index 000000000..77707060e
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWINOGRADLAYER_H__
+#define __ARM_COMPUTE_NEWINOGRADLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+/** Basic function to compute a convolution layer using the Winograd algorithm. This function calls the following NEON kernel:
+ *
+ * -# @ref NEWinogradLayerKernel
+ */
+class NEWinogradLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * Currently only 3x3 kernels are supported.
+ * @param[in] biases Not supported, biases will be ignored.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradLayer(const NEWinogradLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradLayer &operator=(const NEWinogradLayer &) = delete;
+
+private:
+ MemoryGroup _memory_group;
+ NEWinogradLayerKernel _winograd_kernel;
+ Tensor _weights_workspace;
+ Tensor _workspace;
+ Tensor _kernel_storage;
+ const ITensor *_input;
+ const ITensor *_weights;
+ ITensor *_output;
+ bool _reshaped_kernel;
+ std::unique_ptr<Winograd3x3F32> _conv;
+};
+}
+#endif /* __ARM_COMPUTE_NEWINOGRADLAYER_H__ */
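
A sketch of the constraints NEWinogradLayer::configure() documents: F32 data, 3x3 weights, unit strides, and biases ignored (nullptr is passed here to make that explicit).

    #include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"

    using namespace arm_compute;

    void run_winograd_conv(const ITensor *src, const ITensor *weights_3x3, ITensor *dst)
    {
        NEWinogradLayer conv;
        conv.configure(src, weights_3x3, nullptr /* biases are ignored */, dst,
                       PadStrideInfo(1, 1, 0, 0) /* unit strides only */);
        conv.run();
    }
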
diff --git a/arm_compute/runtime/OffsetLifetimeManager.h b/arm_compute/runtime/OffsetLifetimeManager.h
new file mode 100644
index 000000000..e39d6a0d6
--- /dev/null
+++ b/arm_compute/runtime/OffsetLifetimeManager.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H__
+#define __ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H__
+
+#include "arm_compute/runtime/ISimpleLifetimeManager.h"
+
+#include "arm_compute/runtime/Types.h"
+
+#include <cstddef>
+#include <map>
+#include <vector>
+
+namespace arm_compute
+{
+class IMemoryPool;
+
+/** Concrete class that tracks the lifetime of registered tensors and
+ * calculates the system's memory requirements in terms of a single blob and a list of offsets */
+class OffsetLifetimeManager : public ISimpleLifetimeManager
+{
+public:
+ /** Constructor */
+ OffsetLifetimeManager();
+ /** Prevent instances of this class from being copy constructed */
+ OffsetLifetimeManager(const OffsetLifetimeManager &) = delete;
+ /** Prevent instances of this class from being copied */
+ OffsetLifetimeManager &operator=(const OffsetLifetimeManager &) = delete;
+ /** Allow instances of this class to be move constructed */
+ OffsetLifetimeManager(OffsetLifetimeManager &&) = default;
+ /** Allow instances of this class to be moved */
+ OffsetLifetimeManager &operator=(OffsetLifetimeManager &&) = default;
+
+ // Inherited methods overridden:
+ std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
+ MappingType mapping_type() const override;
+
+private:
+ // Inherited methods overridden:
+ void update_blobs_and_mappings() override;
+
+private:
+ size_t _blob; /**< Memory blob size */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H__ */
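
A sketch of how an OffsetLifetimeManager is typically composed into a memory manager and handed to a function such as NEL2NormalizeLayer above. MemoryManagerOnDemand and PoolManager are the library's runtime classes; the configure and pool-finalisation steps are omitted here as the exact calls vary between releases.

    #include "arm_compute/runtime/MemoryManagerOnDemand.h"
    #include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
    #include "arm_compute/runtime/OffsetLifetimeManager.h"
    #include "arm_compute/runtime/PoolManager.h"

    #include <memory>

    using namespace arm_compute;

    // All transient tensors of the function share one blob addressed by offsets.
    void build_l2norm()
    {
        auto lifetime_mgr = std::make_shared<OffsetLifetimeManager>();
        auto pool_mgr     = std::make_shared<PoolManager>();
        auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

        NEL2NormalizeLayer l2_norm(memory_mgr);
        // ... configure l2_norm as usual; finalise the memory manager before run() ...
    }
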
diff --git a/arm_compute/runtime/OffsetMemoryPool.h b/arm_compute/runtime/OffsetMemoryPool.h
new file mode 100644
index 000000000..9685fd131
--- /dev/null
+++ b/arm_compute/runtime/OffsetMemoryPool.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OFFSETMEMORYPOOL_H__
+#define __ARM_COMPUTE_OFFSETMEMORYPOOL_H__
+
+#include "arm_compute/runtime/IMemoryPool.h"
+
+#include "arm_compute/runtime/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+class IAllocator;
+
+/** Offset based memory pool */
+class OffsetMemoryPool : public IMemoryPool
+{
+public:
+ /** Default Constructor
+ *
+ * @note The allocator should outlive the memory pool
+ *
+ * @param[in] allocator Backing memory allocator
+ * @param[in] blob_size Size of the memory to be allocated
+ */
+ OffsetMemoryPool(IAllocator *allocator, size_t blob_size);
+ /** Default Destructor */
+ ~OffsetMemoryPool();
+ /** Prevent instances of this class from being copy constructed */
+ OffsetMemoryPool(const OffsetMemoryPool &) = delete;
+ /** Prevent instances of this class from being copy assigned */
+ OffsetMemoryPool &operator=(const OffsetMemoryPool &) = delete;
+ /** Allow instances of this class to be move constructed */
+ OffsetMemoryPool(OffsetMemoryPool &&) = default;
+ /** Allow instances of this class to be move assigned */
+ OffsetMemoryPool &operator=(OffsetMemoryPool &&) = default;
+
+ // Inherited methods overridden:
+ void acquire(MemoryMappings &handles) override;
+ void release(MemoryMappings &handles) override;
+ MappingType mapping_type() const override;
+ std::unique_ptr<IMemoryPool> duplicate() override;
+
+private:
+ IAllocator *_allocator; /**< Allocator to use for internal allocation */
+ void *_blob; /**< Memory blob */
+ size_t _blob_size; /**< Size of the allocated memory blob */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_OFFSETMEMORYPOOL_H__ */
diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h
index 40704c0a1..9af100c12 100644
--- a/arm_compute/runtime/TensorAllocator.h
+++ b/arm_compute/runtime/TensorAllocator.h
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_TENSORALLOCATOR_H__
#include "arm_compute/runtime/ITensorAllocator.h"
+#include "arm_compute/runtime/Memory.h"
#include <cstdint>
#include <memory>
@@ -86,6 +87,19 @@ public:
*
*/
void free() override;
+ /** Import an existing memory as a tensor's backing memory
+ *
+ * @warning If the tensor is flagged to be managed by a memory manager,
+ * this call will lead to an error.
+ * @warning Ownership of memory depends on the way the @ref Memory object was constructed
+ * @note Calling free on a tensor with imported memory will just clear
+ * the internal pointer value.
+ *
+ * @param[in] memory Memory to import
+ *
+ * @return error status
+ */
+ arm_compute::Status import_memory(Memory memory);
/** Associates the tensor with a memory group
*
* @param[in] associated_memory_group Memory group to associate the tensor with
@@ -104,7 +118,7 @@ protected:
private:
MemoryGroup *_associated_memory_group; /**< Registered memory manager */
- uint8_t *_buffer; /**< CPU memory allocation. */
+ Memory _memory; /**< CPU memory */
Tensor *_owner; /**< Owner of the allocator */
};
}
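
A sketch of the new import_memory() path above, wrapping an externally owned buffer; the Memory(pointer, size) constructor used here is an assumption about the new Memory class, whose full interface is not part of this hunk.

    #include "arm_compute/runtime/Memory.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <cstdint>
    #include <vector>

    using namespace arm_compute;

    // Back `tensor` with `buffer` instead of allocating; per the @warning above the
    // tensor must not be managed by a memory manager, and `buffer` keeps ownership.
    Status wrap_external_buffer(Tensor &tensor, std::vector<uint8_t> &buffer)
    {
        return tensor.allocator()->import_memory(Memory(buffer.data(), buffer.size()));
    }
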