| Field | Value | Date |
|---|---|---|
| author | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:43 +0900 |
| committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:43 +0900 |
| commit | 62529acabbafce7730601ed01d5709d7bc0d378a (patch) | |
| tree | bf6912cfa8fac4a2997292bfcb3c82055734c97e /compute | |
| parent | 6ea13af5257155ff993c205cf997b870cc627f73 (diff) | |
| download | nnfw-62529acabbafce7730601ed01d5709d7bc0d378a.tar.gz nnfw-62529acabbafce7730601ed01d5709d7bc0d378a.tar.bz2 nnfw-62529acabbafce7730601ed01d5709d7bc0d378a.zip | |
Imported Upstream version 1.12.0 (tag: upstream/1.12.0)
Diffstat (limited to 'compute')
137 files changed, 5543 insertions, 2371 deletions
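The hunks below are almost entirely whitespace and comment re-wrapping driven by the newly added `compute/.clang-format` symlink (continuation lines and Doxygen parameter blocks are re-indented and re-aligned); the public interfaces they document are unchanged. For orientation, here is a minimal usage sketch of one of those interfaces, `NEOneHot::configure()`, whose parameter list appears verbatim in the diff. The tensor shapes, data types, and the `main()` scaffold are illustrative assumptions, not code taken from the repository.

```cpp
// Minimal sketch (assumed scaffold): configuring and running NEOneHot, whose
// configure() signature is documented in the NEOneHot/NEOneHotKernel hunks below.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
  // Illustrative shapes: a rank-1 tensor of 4 indices, a scalar depth,
  // and scalar on/off values (types per the documented U32/S32 and F32 support).
  Tensor indices, depth, on_value, off_value, output;
  indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
  depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));
  on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

  // Parameter order as documented in the diff:
  // indices, depth, on_value, off_value, output, axis (defaults to -1).
  NEOneHot one_hot;
  one_hot.configure(&indices, &depth, &on_value, &off_value, &output, /*axis=*/-1);

  indices.allocator()->allocate();
  depth.allocator()->allocate();
  on_value.allocator()->allocate();
  off_value.allocator()->allocate();
  output.allocator()->allocate();

  // ... fill the index/depth/on/off buffers, then execute the function ...
  one_hot.run();
  return 0;
}
```

The same parameter contract is repeated on `NEOneHotKernel::configure()` further down, so the sketch transfers to the kernel-level API directly; the CL-side functions touched by this import follow the usual configure-then-run pattern as well.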
diff --git a/compute/.clang-format b/compute/.clang-format new file mode 120000 index 000000000..0ff66f331 --- /dev/null +++ b/compute/.clang-format @@ -0,0 +1 @@ +../.clang-format.8
\ No newline at end of file diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index d29886a9d..4a3717885 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -255,14 +255,14 @@ private: cl::Device _device; /**< Underlying CL device. */ std::string _kernel_path; /**< Path to the kernels folder. */ mutable std::map<std::string, const Program> - _programs_map; /**< Map with all already loaded program data. */ + _programs_map; /**< Map with all already loaded program data. */ mutable std::map<std::string, cl::Program> - _built_programs_map; /**< Map with all already built program data. */ + _built_programs_map; /**< Map with all already built program data. */ static const std::map<std::string, std::string> - _kernel_program_map; /**< Map that associates kernel names with programs. */ + _kernel_program_map; /**< Map that associates kernel names with programs. */ static const std::map<std::string, std::string> - _program_source_map; /**< Contains sources for all programs. - Used for compile-time kernel inclusion. >*/ + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. >*/ }; } #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h index a614d5259..fb689f747 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -54,8 +54,8 @@ namespace arm_compute class ICLTensor; /** -* @brief Class to perform EmbeddingLookup operation with opencl kernel -*/ + * @brief Class to perform EmbeddingLookup operation with opencl kernel + */ class CLEmbeddingLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h index 99cfa61ec..96f830898 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -55,8 +55,8 @@ namespace arm_compute class ICLTensor; /** -* @brief Class to perform HashtableLookup operation with opencl kernel -*/ + * @brief Class to perform HashtableLookup operation with opencl kernel + */ class CLHashtableLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h index 99bb351bc..963d7b821 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -68,34 +68,37 @@ public: const char *name() const override { return "NEOneHotKernel"; } /** Initialise the kernel's inputs and outputs * - * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up to - * 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor. 
Supported tensor rank: only 1. Data type supported: Same - * as @p on_value - * @param[out] output Destination tensor. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. + * Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) */ void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis = -1); /** Static function to check if given info will lead to a valid configuration of @ref - * NEOneHotKernel + * NEOneHotKernel * - * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: - * up to 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: - * Same as @p on_value - * @param[out] output Destination tensor info. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * The value must be in range [-indices.rank , indices.rank) * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h index 1e69f0912..2aaab6b3a 100644 --- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -72,10 +72,10 @@ namespace shape_calculator * @return the calculated shape */ inline TensorShape compute_transposeconv_upsampled_shape( - const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, - std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, - unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, - unsigned int &pad_top, unsigned int &pad_bottom) + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) { unsigned int sx = info.stride().first; unsigned int sy = info.stride().second; @@ -103,7 +103,7 @@ inline TensorShape compute_transposeconv_upsampled_shape( unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; unsigned int pady_all_except_invallid = - pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right; pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); @@ -135,7 +135,7 @@ compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> & const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); TensorShape out_shape{input_shape}; @@ -160,7 +160,7 @@ inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int idx_channel = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); TensorShape output_shape{input->tensor_shape()}; output_shape.set(idx_width, input->dimension(idx_width) * block); diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h index 409eaf593..026209f69 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h @@ -106,22 +106,24 @@ public: CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; /** Set the input, weights, biases and output tensors. 
* - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this - * is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[out] output Output tensor. + * The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -130,23 +132,24 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. * @param[in] bias (Optional) The biases have one dimension. * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * the @p input. 
* @param[in] info Contains padding and policies to be used in the deconvolution, - * this is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -154,24 +157,26 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLDirectTransposeConvLayer + * CLDirectTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for input - * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped + * with @ref CLWeightsReshapeKernel. 
* * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index e65a646dc..f27e9913e 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -216,7 +216,7 @@ private: CLConvertFullyConnectedWeights _convert_weights; weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed; weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged - _reshape_weights_managed_function; + _reshape_weights_managed_function; CLFlattenLayer _flatten_layer; CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; CLGEMM _mm_gemm; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h index 289ab167f..bdb168664 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) - : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, - _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h index b01ec4255..167554c9e 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -66,7 +66,7 @@ public: * @param[out] output The output tensor, Data types supported: same as @p input. * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 * @return N/A - */ + */ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); /** diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h index 5fb102e47..5b27d362a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -63,20 +63,22 @@ public: /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same - * as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this - * is described in @ref PadStrideInfo. 
- * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -85,22 +87,22 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. Data types supported: - * QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, - * this is described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -108,22 +110,24 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayer + * CLTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as - * @p input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is - * described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. 
* * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h index 18cb61bf9..e34b4dcb0 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) - : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), - _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h index b2ea6270f..1a68f801a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -66,19 +66,20 @@ public: void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis = -1); /** Static function to check if given info will lead to a valid configuration of @ref - * NEOneHotKernel + * NEOneHotKernel * - * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the - * following types: U32/S32 - * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: - * up to 3. Must be one of the following types: U32/S32 - * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: - * U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: - * Same as @p on_value - * @param[out] output Destination tensor info. Data type supported: Same as @p on_value - * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. - * The value must be in range [-indices.rank , indices.rank) + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * The value must be in range [-indices.rank , indices.rank) * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 24ff5dac9..7a08dae97 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -110,39 +110,42 @@ public: /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type - * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 - * for F16 input. - * @param[out] output Output tensor. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias Optional, ignored if NULL. The biases have one dimension. + * Data type supported: Data types supported: S32 for QASYMM8 and + * QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. * */ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom); /** Static function to check if given info will lead to a valid configuration of @ref - * NETransposeConvLayer + * NETransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types - * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. 
- * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, + * F32 for F32 input, F16 for F16 input. + * @param[in] output Output tensor info. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] innvalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. * * @return a status */ diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 81d0cb70f..1a8ff3e71 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -54,123 +54,123 @@ using namespace arm_compute; const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { - // ARMComputeEx kernels - {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, - {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, - {"binary_logical_op", "binary_logical_op.cl"}, - {"cast_bool", "cast.cl"}, - {"embedding_lookup", "embedding_lookup.cl"}, - {"gather_ex", "gather_ex.cl"}, - {"gather_ex_1d", "gather_ex.cl"}, - {"gather_ex_1d_out", "gather_ex.cl"}, - {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, - {"hashtable_lookup", "hashtable_lookup.cl"}, - {"instance_normalization_ex", "instance_normalization_ex.cl"}, - {"multiply_scale_factor", "multiply_scale_factor.cl"}, - {"neg_tensor", "neg_tensor.cl"}, - {"one_hot", "one_hot.cl"}, - {"one_hot_only_on_value", "one_hot.cl"}, - {"quantization_symm8", "quantization_symm8.cl"}, - {"reduce_min_max", "reduce_operation.cl"}, - {"reduce_sum_mean", "reduce_operation.cl"}, - {"topkv2_init", "topkv2.cl"}, - {"topkv2_find_first_negative", "topkv2.cl"}, - {"topkv2_reorder_negatives", "topkv2.cl"}, - {"topkv2_store", "topkv2.cl"}, - {"radixsort_histogram", "topkv2_radixsort.cl"}, - {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, - {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, - {"radixsort_reorder", "topkv2_radixsort.cl"}, - {"topkv2_quicksort", "topkv2_quicksort.cl"}, - {"scale_factor_symm8", "scale_factor.cl"}, + // ARMComputeEx kernels + {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast_bool", "cast.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", 
"neg_tensor.cl"}, + {"one_hot", "one_hot.cl"}, + {"one_hot_only_on_value", "one_hot.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, }; const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS - { - "arg_min_max_ex.cl", + { + "arg_min_max_ex.cl", #include "./cl_kernels/arg_min_max_ex.clembed" - }, - { - "cast.cl", + }, + { + "cast.cl", #include "./cl_kernels/cast.clembed" - }, - { - "embedding_lookup.cl", + }, + { + "embedding_lookup.cl", #include "./cl_kernels/embedding_lookup.clembed" - }, - { - "gather_ex.cl", + }, + { + "gather_ex.cl", #include "./cl_kernels/gather_ex.clembed" - }, - { - "gemmlowp_ex.cl", + }, + { + "gemmlowp_ex.cl", #include "./cl_kernels/gemmlowp_ex.clembed" - }, - { - "hashtable_lookup.cl", + }, + { + "hashtable_lookup.cl", #include "./cl_kernels/hashtable_lookup.clembed" - }, - { - "helpers.h", + }, + { + "helpers.h", #include "./cl_kernels/helpers.hembed" - }, - { - "helpers_asymm.h", + }, + { + "helpers_asymm.h", #include "./cl_kernels/helpers_asymm.hembed" - }, - { - "instance_normalization_ex.cl", + }, + { + "instance_normalization_ex.cl", #include "./cl_kernels/instance_normalization_ex.clembed" - }, - { - "binary_logical_op.cl", + }, + { + "binary_logical_op.cl", #include "./cl_kernels/binary_logical_op.clembed" - }, - { - "multiply_scale_factor.cl", + }, + { + "multiply_scale_factor.cl", #include "./cl_kernels/multiply_scale_factor.clembed" - }, - { - "neg_tensor.cl", + }, + { + "neg_tensor.cl", #include "./cl_kernels/neg_tensor.clembed" - }, - { - "one_hot.cl", + }, + { + "one_hot.cl", #include "./cl_kernels/one_hot.clembed" - }, - { - "quantization_symm8.cl", + }, + { + "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" - }, - { - "reduce_operation.cl", + }, + { + "reduce_operation.cl", #include "./cl_kernels/reduce_operation.clembed" - }, - { - "scale_factor.cl", + }, + { + "scale_factor.cl", #include "./cl_kernels/scale_factor.clembed" - }, - { - "topkv2.cl", + }, + { + "topkv2.cl", #include "./cl_kernels/topkv2.clembed" - }, - { - "topkv2_radixsort.cl", + }, + { + "topkv2_radixsort.cl", #include "./cl_kernels/topkv2_radixsort.clembed" - }, - { - "topkv2_quicksort.cl", + }, + { + "topkv2_quicksort.cl", #include "./cl_kernels/topkv2_quicksort.clembed" - }, + }, #endif /* EMBEDDED_KERNELS */ }; CLKernelLibraryEx::CLKernelLibraryEx() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the // CLKernelLibraryEx is built @@ -337,8 +337,8 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); ARM_COMPUTE_ERROR_ON_MSG( - err != 0, - "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + err != 0, + 
"clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl index 0a014d15c..135cacf59 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl @@ -119,15 +119,15 @@ inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 < in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 < in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 < in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 < in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -204,15 +204,15 @@ inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - idx_sel.s0123 = (in.s0123 > in.s4567) || - (in.s0123 == in.s4567 && - CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + idx_sel.s0123 = + (in.s0123 > in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); idx_sel.s01 = - (in.s01 > in.s23) || - (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + (in.s01 > in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); in.s01 = select(in.s23, in.s01, idx_sel.s01); res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); @@ -296,22 +296,21 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), const uint x_idx = get_global_id(0); const uint y_idx = get_global_id(1); const __global DATA_TYPE *src_in_row = - (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + - y_idx * src_step_y); + (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); for (unsigned int y = 0; y < get_local_size(1); ++y) { #if defined(ARG_MAX) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_max_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); #else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) #else // defined(ARG_MIN) #if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_min_prev_out( - src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); + local_results[lid] = + arg_idx_min_prev_out(src_in_row, 
(__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); #else // !defined(PREV_OUTPUT) local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); #endif // defined(PREV_OUTPUT) @@ -334,12 +333,12 @@ __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); #if defined(ARG_MAX) condition_check3 = - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3); #else // defined(ARG_MIN) local_results[lid] = select( - local_results[lid], local_results[lid + i], - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); + local_results[lid], local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); #endif // defined(ARG_MAX) || defined(ARG_MIN) } barrier(CLK_LOCAL_MEM_FENCE); @@ -403,7 +402,7 @@ __kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output) { VEC_DATA_TYPE(DATA_TYPE, 16) in = - CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); + CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl index e249663bc..f8b5bbeb8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -111,14 +111,14 @@ __kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATI #if OP_CODE == 1 // LOGICAL AND VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) && - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); #elif OP_CODE == 2 // LOGICAL OR VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) || - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl index 92e5dfbee..5ebc78d23 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -117,15 +117,15 @@ __kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION // lookup ids for based on the tensor dimensions int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? 
*((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl index 80ba73d1d..85fc09de4 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \ - defined(COLS_A) + defined(COLS_A) #define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X) @@ -117,7 +117,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( , uint dst_cross_plane_pad #endif // REINTERPRET_OUTPUT_AS_3D - ) +) { int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; @@ -208,9 +208,9 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( - 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); + 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0; @@ -251,7 +251,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl index a4f7dbd48..3ace1fde8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -115,15 +115,15 @@ __kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? 
*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; if (lup_id[NUM_DIMS - 1] < 0) { diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index e07a25ec9..4a3bc1369 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -49,7 +49,7 @@ #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ - defined(cl_arm_integer_dot_product_accumulate_int8) + defined(cl_arm_integer_dot_product_accumulate_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && // defined(cl_arm_integer_dot_product_accumulate_int8) @@ -288,21 +288,21 @@ #define VECTOR_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ - uint name##_offset_first_element_in_bytes + uint name##_offset_first_element_in_bytes #define IMAGE_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_offset_first_element_in_bytes #define TENSOR3D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ - uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ + uint name##_offset_first_element_in_bytes #define TENSOR4D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ - uint name##_step_w, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ @@ -406,9 +406,9 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ uint stride_x, uint step_x) { Vector vector = { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, }; vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; return vector; @@ -436,7 +436,7 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el .stride_x = stride_x, .stride_y = stride_y}; img.ptr += - img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index 5f1b3f902..d7f1d0814 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -100,16 +100,16 @@ inline float 
dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(type, size) \ - quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ - VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = \ + CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -119,11 +119,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) \ - dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -134,7 +134,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ - VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ { \ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ @@ -152,32 +152,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Product of two fixed-point numbers. 
*/ -#define ASYMM_MULT_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(int, size) \ - overflow = a == b && a == INT_MIN; \ - VEC_DATA_TYPE(long, size) \ - a_64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b_64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - ab_64 = a_64 * b_64; \ - /* Revert COMPMID-907 */ \ - VEC_DATA_TYPE(long, size) \ - mask1 = 1 << 30; \ - VEC_DATA_TYPE(long, size) \ - mask2 = 1 - (1 << 30); \ - VEC_DATA_TYPE(long, size) \ - is_positive_or_zero = ab_64 >= 0; \ - VEC_DATA_TYPE(long, size) \ - nudge = select(mask2, mask1, is_positive_or_zero); \ - VEC_DATA_TYPE(long, size) \ - mask = 1ll << 31; \ - VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ - return select(ab_x2_high32, INT_MAX, overflow); \ +#define ASYMM_MULT_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(int, size) \ + overflow = a == b && a == INT_MIN; \ + VEC_DATA_TYPE(long, size) \ + a_64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ + VEC_DATA_TYPE(int, size) \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + return select(ab_x2_high32, INT_MAX, overflow); \ } /** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). @@ -186,32 +186,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ - a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = \ - ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + \ - ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ + a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = \ + ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + \ + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ } /** Each bit of the result is set to the corresponding bit of either then_val or @@ -263,15 +263,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define EXP_BARREL_SHIFTER_IMPL(size) \ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ - VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ - int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ { \ if (k_integer_bits > exponent) \ { \ const int k_shift_amount = k_integer_bits > exponent ? 
k_fractional_bits + exponent : 0; \ return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ + ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ } \ \ return result; \ @@ -285,7 +285,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ { \ const int k_fractional_bits = 31 - k_integer_bits; \ VEC_DATA_TYPE(int, size) \ @@ -298,7 +298,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ VEC_DATA_TYPE(int, size) \ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ - a_mod_quarter_minus_one_quarter_scaled, size); \ + a_mod_quarter_minus_one_quarter_scaled, size); \ VEC_DATA_TYPE(int, size) \ remainder = a_mod_quarter_minus_one_quarter - a; \ \ @@ -312,10 +312,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) remainder, size); \ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ - size); \ result = \ - EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = \ + EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ \ if (k_integer_bits > 5) \ { \ @@ -335,27 +335,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Arithmetic left or right shift. 
*/ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if (exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ } /** Calculates (a+b)/2, rounded to the nearest integer. @@ -365,21 +365,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return (a+b)/2, rounded to the nearest integer. */ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, sum >= 0); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). 
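[Editorial note, not part of the commit] The helpers_asymm.h hunks in this import are whitespace-only: clang-format re-indents the fixed-point macros without changing their arithmetic. For readers who do not want to unpick the backslash continuations, the scalar C++ sketch below restates what the visible macro bodies of QUANTIZE_IMPL, DEQUANTIZE_IMPL, ASYMM_MULT and MULTIPLY_BY_QUANTIZED_MULTIPLIER compute. All names here are illustrative, the body of rounding_divide_by_pow2 is an assumption (the full ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL body is not shown in these hunks), and none of this code is part of the diff itself.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Affine quantize/dequantize, scalar counterparts of QUANTIZE_IMPL / DEQUANTIZE_IMPL:
// q = round_to_nearest_even(x / scale + offset), saturated to the 8-bit range.
inline int8_t quantize_scalar(float x, float offset, float scale)
{
  const float q_f32 = x / scale + offset;                       // out_f32 in the macro
  const int rounded = static_cast<int>(std::lrintf(q_f32));     // CONVERT_DOWN_RTE
  return static_cast<int8_t>(std::max(-128, std::min(127, rounded))); // CONVERT_SAT
}

inline float dequantize_scalar(int8_t q, float offset, float scale)
{
  return (static_cast<float>(q) - offset) * scale;
}

// ASYMM_MULT: round((a * b) / 2^31), saturating when both inputs are INT32_MIN.
inline int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
  const bool overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int64_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
  const int32_t high = static_cast<int32_t>((ab + nudge) / (int64_t{1} << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : high;
}

// Assumed behaviour of asymm_rounding_divide_by_POW2: divide by 2^exponent,
// rounding to nearest with ties away from zero.
inline int32_t rounding_divide_by_pow2(int32_t x, int32_t exponent)
{
  const int32_t mask = (int32_t{1} << exponent) - 1;
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// multiply_by_quantized_multiplier: shift left first when shift > 0, otherwise
// multiply and then shift right with rounding, exactly as in the macro body.
inline int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int shift)
{
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  return rounding_divide_by_pow2(
    saturating_rounding_doubling_high_mul(input * (1 << left_shift), qmul), right_shift);
}

These are the fixed-point primitives the quantized CL kernels later in this diff rely on to requantize int32 accumulators back to 8-bit outputs; the reformatting below leaves all of them functionally untouched.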
@@ -390,7 +390,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ { \ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ @@ -462,14 +462,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ - right_shift, size); \ +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ } #define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ multiply_by_quantized_multiplier##size(input, qmul, shift) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl index 014842680..96a243110 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ - defined(DIM_Y) && defined(DIM_Z) + defined(DIM_Y) && defined(DIM_Z) /** This function normalizes the input 2D tensor across the first dimension with respect to mean and * standard deviation of the same dimension. 
* @@ -108,14 +108,14 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output) #endif /* IN_PLACE */ #ifdef GAMMA - , + , VECTOR_DECLARATION(gamma) #endif // GAMMA #ifdef BETA - , + , VECTOR_DECLARATION(beta) #endif // BETA - ) +) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); #ifndef IN_PLACE @@ -213,12 +213,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (int i_h = 0; i_h < DIM_Z; ++i_h) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } @@ -231,12 +231,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -251,12 +251,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x < DIM_X; ++x) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl index 3943fc4c2..abbfbd275 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl @@ -114,8 +114,8 @@ __kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION (val, 0, (__global DATA_TYPE *)output.ptr); #else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE *)(output.ptr)) = - ((DATA_TYPE)(*((__global int *)(input.ptr)))) * - *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); + ((DATA_TYPE)(*((__global int *)(input.ptr)))) * + *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl index c274aba62..784a8d6aa 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl @@ -206,16 +206,16 @@ __kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLAR #if AXIS == 0 *(__global 
DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 1 *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 2 *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #elif AXIS == 3 *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) = - *((__global const DATA_TYPE *)on_value_ptr); + *((__global const DATA_TYPE *)on_value_ptr); #endif // AXIS } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index 76fda9041..532000e9e 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -138,7 +138,7 @@ __kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARAT // Multiply with a multiplier smaller than 1 out_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl index 4ae9adb0b..c829f264d 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl @@ -116,7 +116,7 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc // Create scale vector const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = - *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); + *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); // Quantize VEC_DATA_TYPE(int, VEC_SIZE) @@ -127,10 +127,10 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); #else //! 
defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP( - CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / - (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), - int), - MIN_QUANT_VAL, MAX_QUANT_VAL); + CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / + (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), + int), + MIN_QUANT_VAL, MAX_QUANT_VAL); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } #endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl index 832ac1270..d0ef31b20 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -100,12 +100,14 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(o Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE value = - *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); for (int i = 1; i < dim; ++i) { indices[axis] = i; @@ -186,16 +188,18 @@ __kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION( Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE sum_value = (DATA_TYPE)0; for (int i = 0; i < dim; ++i) { indices[axis] = i; - sum_value += *( - (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + sum_value += + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); } #if OP_CODE == 3 // REDUCE_SUM diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp index 047004d5e..45307fad7 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -63,10 +63,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, + op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); @@ -101,13 +102,13 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, output_shape.set(axis, 1); DataType 
output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32; auto_init_if_empty(*output, input->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); - Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), - Steps(vector_size)); + Window win = + calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size)); bool window_changed = false; switch (axis) @@ -137,15 +138,15 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, } Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_tuple(err, win); } } // namespace CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx() - : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), - _op(ReductionOperation::ARG_IDX_MAX) + : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), + _op(ReductionOperation::ARG_IDX_MAX) { } @@ -155,11 +156,11 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, - output->info(), axis, op)); + validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, + output->info(), axis, op)); auto win_config = validate_and_configure_window( - input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, - op); + input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, + op); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); _input = input; @@ -213,7 +214,7 @@ void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor ARM_COMPUTE_ERROR("Not supported"); } _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( - "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); + "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); // Configure kernel window ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); @@ -225,8 +226,8 @@ Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITenso { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr, - output->clone().get(), axis, op))); + input->clone().get(), (prev_output != nullptr) ? 
prev_output->clone().get() : nullptr, + output->clone().get(), axis, op))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index fbc76f5e1..ffa2c5a67 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -55,7 +55,7 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); @@ -68,15 +68,15 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; } } // namespace CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) + : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -111,13 +111,13 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); const ValidRegion &valid_region = broadcast_pair.second; @@ -130,8 +130,8 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); output_access.set_valid_region(win, valid_region); @@ -151,7 +151,7 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); @@ -160,13 +160,13 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) bool has_collapsed 
= false; Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; Window slice = collapsed.first_slice_window_3D(); Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); @@ -189,9 +189,9 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) BorderSize CLBinaryLogicalOpKernel::border_size() const { const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); return BorderSize(0, border, 0, 0); } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp index 6e0bcde7f..3f2ae357d 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -103,7 +103,7 @@ void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output) // Create kernel const std::string kernel_name = "cast_bool"; _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); // Configure kernel ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index 67aaf2db6..e4c617c8d 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -61,14 +61,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) + : _input(nullptr), _output(nullptr), _lookups(nullptr) { } @@ -77,8 +77,8 @@ Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -108,8 +108,8 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 3bfe3e407..8b5885225 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -62,15 +62,15 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis); + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -86,7 +86,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); std::unique_ptr<ITensorInfo> output_info = input->clone(); output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis)); + input->tensor_shape(), indices->tensor_shape(), actual_axis)); // Output auto initialization if not yet initialized auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); @@ -100,7 +100,7 @@ std::pair<Status, Window> 
validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLGatherExKernel::CLGatherExKernel() - : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { } @@ -109,11 +109,11 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), indices->info(), output->info(), axis)); + validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window auto win_config = - validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); _input = input; @@ -133,7 +133,7 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); ICLKernel::configure_internal(win_config.second); } @@ -144,7 +144,7 @@ Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis) - .first); + .first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index 930e7c944..f0a761b97 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -61,8 +61,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace @@ -78,8 +78,8 @@ Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); @@ -102,7 +102,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -113,7 +113,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso // Make _lookup_indices tensor _lookup_indices = support::cpp14::make_unique<CLTensor>(); _lookup_indices->allocator()->init( - TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); // Set kernel build options @@ -127,8 +127,8 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -148,7 +148,7 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) // Set values of hits const int32_t *lookups_buf = - reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index 61c14d271..dab6480b2 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -94,8 +94,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() - : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), - 
_run_in_place(false) + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) { } @@ -132,7 +132,7 @@ void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info()); @@ -147,7 +147,7 @@ Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index 6b27c9917..1d4b141a7 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -99,7 +99,7 @@ std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *inpu } // namespace CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -108,7 +108,7 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -123,9 +123,9 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen Window win = calculate_max_window(*output->info()); if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -134,11 +134,11 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + multi_access_x, "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); } Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, @@ -147,7 +147,7 @@ Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, 
scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index 643c8b110..ee633d437 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -80,9 +80,9 @@ void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) std::set<std::string> build_opts; build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); // Configure window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp index 35d70d689..0b8e7cc41 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp @@ -65,7 +65,7 @@ inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo * { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( - indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); + indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } return Status{}; @@ -79,7 +79,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices, const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions())); // Output auto initialization if not yet initialized TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( - indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); + indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); auto_init_if_empty((*output), output_shape, 1, on_value->data_type()); // Create window Window win = calculate_max_window(*output, Steps()); @@ -88,8 +88,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices, } } // namespace CLOneHotKernel::CLOneHotKernel() - : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), - _is_off_value_memset(false) + : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), + _is_off_value_memset(false) { } void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value, @@ -114,10 +114,10 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor ICLTensor *output, int depth, int axis) { ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis)); + validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis)); // Configure kernel window auto win_config = - 
validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis); + validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); if (_is_off_value_memset) { @@ -131,7 +131,7 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor // Set build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size( - data_size_from_type(on_value->info()->data_type()))); + data_size_from_type(on_value->info()->data_type()))); build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth)); build_opts.add_option("-DOUTPUT_DIM_Z=" + @@ -139,7 +139,7 @@ void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor // Create kernel const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot"; _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); ICLKernel::configure_internal(win_config.second); } Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, @@ -153,7 +153,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), on_value->clone().get(), output->clone().get(), depth, axis) - .first); + .first); return Status{}; } Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, @@ -163,7 +163,7 @@ Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *o ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), on_value->clone().get(), output->clone().get(), depth, axis) - .first); + .first); return Status{}; } void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index 1a7a18cfa..b417a7103 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -87,9 +87,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } Coordinates coord; @@ -101,7 +101,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr) { } @@ -110,7 +110,7 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -132,11 +132,11 @@ void 
CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); + multi_access_x, + "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); } Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, @@ -145,7 +145,7 @@ Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), output->clone().get()).first); + validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index 3fbebf25a..3906009c2 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -145,7 +145,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu // Create kernel _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); // Configure kernel window Window win = calculate_max_window(*output_info, Steps()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index 8d8853c81..4a6374444 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -94,8 +94,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_tuple(err, win); } } // namespace @@ -115,7 +115,7 @@ void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *outp // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); + CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -128,7 +128,7 @@ Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITenso { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp index dfe5d59b0..c88bef6d7 100644 --- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -53,12 +53,12 @@ namespace using namespace arm_compute; template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> void elementwise_op_templ( - const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, - OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, - OutputScalarType *)) + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -88,26 +88,26 @@ void elementwise_op_templ( Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = - reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = - *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, - non_broadcast_input_ptr, broadcast_value, - output_ptr, !is_broadcast_input_2); - for (; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = - (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, - !is_broadcast_input_2 ? 
a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = + (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_value, output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -119,24 +119,23 @@ void elementwise_op_templ( Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = - reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = - reinterpret_cast<const InputScalarType *>(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, - input1_ptr, input2_ptr, output_ptr); - for (; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); + execute_window_loop( + win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); } } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp index 32d7d6237..a8464afce 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -103,8 +103,10 @@ template <BinaryLogicalOperation op> inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) { uint8x16x4_t out = {{ - elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), - elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), + elementwise_logic_op<op>(a.val[0], b.val[0]), + elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), + elementwise_logic_op<op>(a.val[3], b.val[3]), }}; return out; } @@ -160,8 +162,8 @@ void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, } std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( - const ITensor *input1, const ITensor *input2, ITensor *output, - std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) + const ITensor *input1, const ITensor *input2, ITensor *output, + 
std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) { std::string function_to_call("op_"); function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; @@ -184,8 +186,8 @@ std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) { static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { - {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, - {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; return configure_func(input1, input2, output, map_function); } @@ -223,7 +225,7 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); const TensorShape out_shape = - TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -232,8 +234,8 @@ Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &inp if (output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp index 12017e543..f935596e6 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -129,125 +129,125 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) case DataType::S8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8( - texels_u8, vdupq_n_u8(true_val)))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_s8(output_ptr + x, + vreinterpretq_s8_u8(vandq_u8(texels_u8, vdupq_n_u8(true_val)))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S16: { /* Up-conversion U8 -> S16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = 
reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s16(output_ptr + x, texels.val[0]); - vst1q_s16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s16(output_ptr + x, texels.val[0]); + vst1q_s16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - - vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = 
static_cast<uint32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(output_ptr + x + 12, - vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); - *(output_ptr + x) = static_cast<float>(in); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); + *(output_ptr + x) = static_cast<float>(in); + } + }, + input, output); break; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -255,86 +255,87 @@ void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) { /* Up-conversion U8 -> F16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const int16x8x2_t texels = { - {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), - vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; - vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + int x 
= window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::U8: { /* Conversion U8 -> S8 */ - execute_window_loop(win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } case DataType::U16: { /* Up-conversion U8 -> U16 */ execute_window_loop( - win, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); - - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); - - const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), - vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; - - vst1q_u16(output_ptr + x, texels.val[0]); - vst1q_u16(output_ptr + x + 8, texels.val[1]); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); - } - }, - input, output); + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), + vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; + + vst1q_u16(output_ptr + x, texels.val[0]); + vst1q_u16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); break; } default: diff --git 
a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp index 091d38c56..e3a77c6b1 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -50,7 +50,7 @@ using namespace arm_compute; NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() - : _input(nullptr), _lookups(nullptr), _output(nullptr) + : _input(nullptr), _lookups(nullptr), _output(nullptr) { } @@ -79,8 +79,8 @@ Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); @@ -119,16 +119,17 @@ void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const int32_t lookup = *reinterpret_cast<int32_t *>( - _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const int32_t lookup = + *reinterpret_cast<int32_t *>(_lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp index 93963a504..c9f0799d4 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -71,7 +71,7 @@ template <typename U> void validate_indices(const ITensor *indices) } // namespace NEGatherKernelEx::NEGatherKernelEx() - : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} + : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} { } @@ -85,36 +85,35 @@ inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadIn Iterator output_it(_output, window); execute_window_loop( - window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); - break; - case 2: - new_index = - *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); - break; - case 3: - new_index = *( - reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(0, new_index); - - 
std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); } template <typename U> @@ -130,37 +129,36 @@ void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &inf Iterator output_it(_output, output_window); execute_window_loop( - output_window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices_rank, _axis); - - U new_index; - switch (_indices_rank) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); - break; - case 2: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); - break; - case 3: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(_axis, new_index); - - std::copy_n(_input->ptr_to_element(gather_id), - _input->info()->dimension(0) * _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank, _axis); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr()); + }, + output_it); } void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, @@ -170,8 +168,8 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); _input = input; _indices = indices; @@ -217,7 +215,7 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I } // Output auto initialization if not yet initialized TensorShape 
output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); // Create window @@ -243,15 +241,15 @@ Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), axis); + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp index 30787c0a4..52b40e767 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -57,7 +57,7 @@ constexpr size_t NOT_HIT = 0xFFFFFFFF; } // namespace NEHashtableLookupKernel::NEHashtableLookupKernel() - : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} { } @@ -66,7 +66,7 @@ void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *k { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -92,8 +92,8 @@ Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); @@ -134,8 +134,8 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) const size_t lookup_dim = _output->info()->num_dimensions() - 1; const int const_0 = _output->info()->data_type() == DataType::QASYMM8 - ? _output->info()->quantization_info().uniform().offset - : 0; + ? 
_output->info()->quantization_info().uniform().offset + : 0; std::unordered_map<int32_t, size_t> key_index_map; for (size_t n = 0; n < _keys->info()->dimension(0); ++n) @@ -174,24 +174,24 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const auto lookup = lookup_indices.at(id[lookup_dim]); - if (lookup == NOT_HIT) - { - memset(output_it.ptr(), const_0, - _output->info()->dimension(0) * _output->info()->element_size()); - } - else - { - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - } - - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp index 49adf1462..4dc0f5535 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -63,7 +63,7 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma { /** NEON vector tag type. 
*/ using ExactTagType = - typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; // Clear X/Y dimensions on execution window as we handle the planes manually Window win = window; @@ -73,107 +73,107 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma constexpr int window_step_x = 16 / sizeof(T); const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); const auto channel_idx = - get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); Iterator input_it(input, win); execute_window_loop( - win, - [&](const Coordinates &id) { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast<T>(0.f); - auto sum_squares_h_w = static_cast<T>(0.f); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); - vec_sum_squares_h_w = - wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); - } - - auto vec2_sum_h_w = - wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), - wrapper::vgetlow(vec_sum_squares_h_w)); - for (int i = 0; i < window_step_x / 4; ++i) - { - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - } - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - const auto value = *(input_ptr + x); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - auto gamma_val = 1.0f; - if (gamma != nullptr) - { - gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); - } - const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); - auto beta_val = 0.0f; - if (beta != nullptr) - { - beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); - } - const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); - auto 
output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - vec_val = wrapper::vloadq(input_ptr + x); - vec_val = wrapper::vadd( - wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); - wrapper::vstore(output_ptr + x, vec_val); - } - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; - } - }, - input_plane_it, output_plane_it); - }, - input_it); + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = 
window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, @@ -199,8 +199,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - gamma->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), "Gamma's size must be the same as size of input's channel"); } @@ -208,8 +208,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - beta->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), "Beta's size must be the same as size of input's channel"); } @@ -234,8 +234,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() - : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), - _epsilon(1e-12) + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) { } @@ -251,7 +251,7 @@ void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *ou _epsilon = epsilon; ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); if (_input->info()->data_type() == DataType::F32) { @@ -282,7 +282,7 @@ Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? 
input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp index b92130cec..ad4728175 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -123,15 +123,17 @@ inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) const float32x4_t vscale = vdupq_n_f32(scale); const float32x4x4_t ret = {{ - vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), - vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), }}; return ret; } } // namespace NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -140,7 +142,7 @@ void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -180,25 +182,25 @@ template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &w Iterator output(_output, win_collapsed); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); - scale *= _multiplier; - - const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); - auto output_ptr = reinterpret_cast<T *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = input_ptr[x] * scale; - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); } void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp index 0a11eb509..0daff5c6a 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -101,8 +101,8 @@ bool isOnValue(U index, U depth) } // namespace NEOneHotKernel::NEOneHotKernel() - : 
_indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1}, - _output{nullptr}, _func{} + : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, + _off_value{nullptr}, _axis{-1}, _output{nullptr}, _func{} { } @@ -117,22 +117,22 @@ void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info) Iterator output_it(_output, output_window); const U off_value = *reinterpret_cast<U *>(_off_value->buffer()); execute_window_loop( - output_window, - [&](const Coordinates &id) { - std::fill_n(output_it.ptr(), - _output->info()->dimension(0) * _output->info()->element_size(), off_value); - Coordinates indices_id(id); - indices_id.remove(0); - const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(0, new_index); - std::copy_n(_on_value->buffer(), _output->info()->element_size(), - _output->ptr_to_element(onehot_id)); - } - }, - output_it); + output_window, + [&](const Coordinates &id) { + std::fill_n(output_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size(), + off_value); + Coordinates indices_id(id); + indices_id.remove(0); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(0, new_index); + std::copy_n(_on_value->buffer(), _output->info()->element_size(), + _output->ptr_to_element(onehot_id)); + } + }, + output_it); } template <typename U> @@ -142,22 +142,22 @@ inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo // Validate that the indices are not negative validate_depth<U>(_depth, _output, _axis); Iterator output_it(_output, window); - execute_window_loop(window, - [&](const Coordinates &id) { - Coordinates indices_id(id); - indices_id.remove(_axis); - const U new_index = - *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); - if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) - { - Coordinates onehot_id(id); - onehot_id.set(_axis, new_index); - std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer() - : _off_value->buffer(), - _output->info()->element_size(), output_it.ptr()); - } - }, - output_it); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates indices_id(id); + indices_id.remove(_axis); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(_axis, new_index); + std::copy_n(static_cast<U>(id[_axis]) == new_index ? 
_on_value->buffer() + : _off_value->buffer(), + _output->info()->element_size(), output_it.ptr()); + } + }, + output_it); } void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth, @@ -215,7 +215,7 @@ Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *d const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR( - validate_arguments(indices, depth, on_value, off_value, output, axis)); + validate_arguments(indices, depth, on_value, off_value, output, axis)); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 5841f1d69..2306228d5 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -107,19 +107,15 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, const int32x4x4_t rf = {{ #ifdef __aarch64__ - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #else //__aarch64__ - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #endif //__aarch64__ }}; const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); @@ -129,7 +125,7 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, } // namespace NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() - : _input(nullptr), _output(nullptr), _scale_factor(nullptr) + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) { } @@ -138,7 +134,7 @@ void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *out { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), output->info(), scale_factor->info())); + validate_arguments(input->info(), output->info(), scale_factor->info())); _input = input; _output = output; @@ -182,40 +178,40 @@ template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window const auto dim_x = 
_input->info()->dimension(0); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - const auto start = reinterpret_cast<const T *>(input.ptr()); - const auto min_max = std::minmax_element(start, start + dim_x); - const auto int8_scale = 127; - auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); - if (range == 0) - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; - range = 1; - } - else - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; - } - const auto scale_factor_inv = int8_scale / range; - - auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], - vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); - quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); - output_ptr[x] = static_cast<int8_t>(quantized); - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); } void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp index 267228eac..b02a48ef2 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -50,8 +50,8 @@ namespace arm_compute { CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), - _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() + : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), + _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() { } @@ -60,13 +60,13 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && - op != ReductionOperation::ARG_IDX_MIN, + op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const unsigned int num_of_stages = - calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; @@ -76,9 +76,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (output->total_size() != 0) { output_data_type = output->data_type(); - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( - arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, - false)); + const TensorInfo expected_output_shape = + output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape( + input->tensor_shape(), axis, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); } @@ -87,9 +87,9 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo) { ti.set_data_type(data_type) - .set_tensor_shape(shape) - .set_num_channels(num_channels) - .set_quantization_info(qinfo); + .set_tensor_shape(shape) + .set_num_channels(num_channels) + .set_quantization_info(qinfo); }; initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, @@ -98,7 +98,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT if (num_of_stages == 1) { ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); } else { @@ -118,19 +118,19 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT // Validate ReductionOperation only on first kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); + CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); // Validate ReductionOperation on intermediate stages for (unsigned int i = 1; i < num_of_stages - 1; ++i) { - ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], - &sums_vector[i], axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); } // Validate ReductionOperation on the last stage const unsigned int last_stage = num_of_stages - 1; ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( - input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); + input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); } ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(¬_reshaped_output, output)); return Status{}; @@ -144,16 +144,16 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * _reduction_axis = axis; const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape( - input->info()->tensor_shape(), axis, false); + input->info()->tensor_shape(), 
axis, false); DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) - ? DataType::S32 - : output->info()->data_type(); + ? DataType::S32 + : output->info()->data_type(); auto_init_if_empty(*output->info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); // Configure reduction operation kernels _reduction_kernels_vector.resize(_num_of_stages); @@ -166,11 +166,11 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * TensorShape output_shape{input->info()->tensor_shape()}; output_shape.set(axis, 1); auto_init_if_empty(*_not_reshaped_output.info(), input->info() - ->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); _not_reshaped_output.info()->set_tensor_shape(output_shape); _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); } @@ -182,7 +182,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * { shape.set(0, ceil(shape.x() / 128.f)); _results_vector[i].allocator()->init( - input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); + input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); } // Apply ReductionOperation only on first kernel diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 3dede0562..6359b4bcb 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -53,16 +53,10 @@ namespace arm_compute using namespace arm_compute::misc::shape_calculator; CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( - std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _scale_f(), - _conv_f(), - _flip_weights(), - _scaled_output(), - _original_weights(nullptr), - _weights_flipped(), - _flip_axis(), - _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), _flip_weights(), + _scaled_output(), _original_weights(nullptr), _weights_flipped(), _flip_axis(), + _is_prepared(false) { } @@ -74,7 +68,7 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); const DataLayout data_layout = input->data_layout(); @@ -86,8 +80,8 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), - weights->dimension(idx_h), info, invalid_right, 
invalid_bottom); + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); @@ -117,19 +111,19 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape) - .set_data_layout(data_layout)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } @@ -171,22 +165,22 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(idx_w), input->info()->dimension(idx_h), - weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, - invalid_bottom); + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized auto_init_if_empty( - *output->info(), - input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( - input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), info, + invalid_right, invalid_bottom)); _is_prepared = weights_info.retain_internal_weights(); @@ -195,8 +189,8 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order // to match output shape const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index 01989461e..79d0929a9 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I ARM_COMPUTE_UNUSED(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -80,12 +80,12 @@ Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), - _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), - _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), - _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), - _original_weights(nullptr) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), + _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), + _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), + _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), + _original_weights(nullptr) { } void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -107,8 +107,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -140,10 +140,10 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen bool is_fc_after_conv = false; if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -158,28 +158,28 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Extract scale factor _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); _memory_group.manage(&_scale_factor); _scale_factor_kernel.configure(input, &_scale_factor); // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _memory_group.manage(&_quantized_input); _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); // GEMMLowp _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); _memory_group.manage(&_gemmlowp_output); configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output, fc_info.retain_internal_weights); @@ -209,15 +209,15 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe const GPUTarget gpu_target = CLScheduler::get().target(); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); + CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); } // With the Fully Connected layer we can have 4 different cases: @@ -247,33 +247,32 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = 
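// The hybrid fully-connected path configured above chains four pieces: a per-batch scale factor,
// a symmetric int8 copy of the activations, an integer GEMM with an S32 accumulator, and a final
// multiply by the scale factors. The standalone sketch below walks one output neuron and one
// batch through that chain; the max|x|/127 mapping and the sample numbers are illustrative
// assumptions, not a quote of the kernels.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  const std::vector<float> input = {0.5f, -1.25f, 2.0f};
  const std::vector<int8_t> weights = {40, -10, 25}; // already symmetrically quantized
  const float weight_scale = 0.02f;                  // real weight = q * weight_scale

  // Per-batch scale factor: map max |x| onto the int8 range.
  float max_abs = 0.f;
  for (float v : input)
    max_abs = std::max(max_abs, std::fabs(v));
  const float input_scale = max_abs / 127.f;

  // Symmetric quantization of the activations (signed 8-bit storage, zero offset).
  std::vector<int8_t> q_input(input.size());
  for (size_t i = 0; i < input.size(); ++i)
  {
    long q = std::lround(input[i] / input_scale);
    q_input[i] = static_cast<int8_t>(std::max(-127L, std::min(127L, q)));
  }

  // Integer accumulation (the GEMMLowp stage, S32 accumulator).
  int32_t acc = 0;
  for (size_t i = 0; i < input.size(); ++i)
    acc += static_cast<int32_t>(q_input[i]) * static_cast<int32_t>(weights[i]);

  // Rescale back to float (the multiply-scale-factor stage).
  const float hybrid = acc * input_scale * weight_scale;

  float reference = 0.f;
  for (size_t i = 0; i < input.size(); ++i)
    reference += input[i] * weights[i] * weight_scale;

  std::printf("hybrid=%f reference=%f\n", hybrid, reference); // the two agree closely
  return 0;
}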
&reshaped_weights; } // Validate Scale factor kernel const ITensorInfo &scale_factor = - TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); + TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); // Validate quantization symm8 kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ON_ERROR( - CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); // Fully Connected layer after a Fully Connected Layer without batches ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate matrix multiply kernel const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); // Multiply scale ARM_COMPUTE_RETURN_ON_ERROR( - CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); + CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 2ff4b9659..13d3acbac 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -79,7 +79,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn int output_multiplier = 0; int output_shift = 0; ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( - multiplier, &output_multiplier, &output_shift)); + multiplier, &output_multiplier, &output_shift)); // Set the GEMMLowp output stage info gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; @@ -99,7 +99,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I { GEMMLowpOutputStageInfo gemmlowp_output_stage; ARM_COMPUTE_RETURN_ON_ERROR( - construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -125,14 +125,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, - gemm_info)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); } else { ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); + CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); } return 
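// construct_gemmlowp_output_stage above turns the real rescale factor
// input_scale * weights_scale / output_scale into an integer multiplier plus a right shift via
// calculate_quantized_multiplier_less_than_one. The sketch below shows the usual way such a
// multiplier in (0, 1) is decomposed into a Q31 fixed-point value and a shift; it is a generic
// illustration of the scheme, not the library function itself.
#include <cmath>
#include <cstdint>
#include <cstdio>

void quantize_multiplier_smaller_than_one(double real, int32_t *quantized, int *right_shift)
{
  int exponent = 0;
  const double q = std::frexp(real, &exponent); // real = q * 2^exponent, q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
  if (q_fixed == (1ll << 31)) // rounding may push q up to exactly 1.0
  {
    q_fixed /= 2;
    ++exponent;
  }
  *quantized = static_cast<int32_t>(q_fixed);
  *right_shift = -exponent; // exponent <= 0 for real < 1, so this is a non-negative shift
}

int main()
{
  const double real_multiplier = 0.0072; // e.g. input_scale * weights_scale / output_scale
  int32_t m = 0;
  int shift = 0;
  quantize_multiplier_smaller_than_one(real_multiplier, &m, &shift);
  std::printf("multiplier=%d right_shift=%d reconstructed=%f\n", m, shift,
              (m / 2147483648.0) / std::pow(2.0, shift));
  return 0;
}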
Status{}; @@ -154,12 +154,12 @@ Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), - _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), - _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), - _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), - _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) + : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), + _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), + _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), + _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), + _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) { } void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -190,9 +190,9 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); @@ -214,8 +214,8 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -223,11 +223,11 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init(input->info() - ->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(shape_flatten) - .set_data_layout(DataLayout::NCHW)); + ->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(shape_flatten) + .set_data_layout(DataLayout::NCHW)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -258,8 +258,8 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; @@ -285,10 +285,10 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -302,7 +302,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor { _reshape_weights_managed_function.configure(weights); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_reshape_weights_managed_function)); + _weights_manager->acquire(weights, &_reshape_weights_managed_function)); } else { @@ -320,7 +320,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(), fc_info.weights_trained_layout); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_convert_weights_managed)); + _weights_manager->acquire(weights, &_convert_weights_managed)); } else { @@ -359,16 +359,16 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_fc_after_conv = true; const ITensorInfo &flatten_input = TensorInfo(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(compute_flatten_shape(input)) - .set_data_layout(DataLayout::NCHW)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(input)) + .set_data_layout(DataLayout::NCHW)); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
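// The std::equal test above (batched branch only) decides whether a batched fully-connected
// input comes straight from a convolution: shapes are stored lowest dimension first, so a conv
// output looks like {W, H, C, batches...} while the FC output is {units, batches...}, and the
// two describe the same batching exactly when input dims from index 3 match output dims from
// index 1. A minimal standalone version of that check (real TensorShapes are fixed-length with
// trailing 1s, so the library does not need the extra length guard used here):
#include <algorithm>
#include <cstdio>
#include <vector>

bool is_fc_after_conv(const std::vector<size_t> &in_shape, const std::vector<size_t> &out_shape)
{
  if (in_shape.size() < 4 || out_shape.size() < 2 ||
      in_shape.size() - 3 != out_shape.size() - 1)
    return false;
  return std::equal(in_shape.begin() + 3, in_shape.end(), out_shape.begin() + 1);
}

int main()
{
  std::printf("%d\n", is_fc_after_conv({7, 7, 64, 8}, {1000, 8})); // 1: conv -> FC, 8 batches
  std::printf("%d\n", is_fc_after_conv({4096, 8}, {1000, 8}));     // 0: FC -> FC, 8 batches
  return 0;
}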
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -396,7 +396,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -404,7 +404,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -412,8 +412,8 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); @@ -427,7 +427,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR( - validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); return Status{}; } @@ -457,7 +457,7 @@ void CLFullyConnectedLayerEx::run() if (_weights_manager && _weights_manager->are_weights_managed(cur_weights)) { _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); } else { diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index 157b4d977..ac6982e6f 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -41,7 +41,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp // reshape auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( - _input->info()->data_layout())); + _input->info()->data_layout())); _cl_reshape.configure(_input, &_cl_buffer); input_to_use = &_cl_buffer; } @@ -57,7 +57,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { bool is_hybrid = (input->info()->data_type() == DataType::F32 || input->info()->data_type() == DataType::F16) && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) @@ -81,7 +81,6 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { throw std::runtime_error("CLFullyConnectedReshapingLayer: 
Unsupported kernel type"); } - }(); if (_needs_reshape) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index 02ee4ad8a..c246041bb 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -46,8 +46,8 @@ using namespace arm_compute; CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), - _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() { } @@ -91,13 +91,13 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * for (size_t i = 0; i < num_of_kernels; ++i, ++it) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); } if (!keep_dims) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); + CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp index a502f032e..12c0aa829 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -134,8 +134,8 @@ void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &ou // Output auto inizialitation if not yet initialized TensorInfo tmp_output_info = *output->info()->clone(); auto_init_if_empty( - tmp_output_info, - input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + tmp_output_info, + input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); // Update coordinate on axis start_coords.set(split_dim, axis_offset); @@ -153,7 +153,7 @@ void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &ou } // namespace CLSplitVEx::CLSplitVEx() - : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() + : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp index 3ac95a8e6..accd51302 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -49,14 +49,14 @@ namespace arm_compute { CLTopKV2::CLTopKV2() - : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), - _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), - _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), - _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), - _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), - _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), - _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), - _reorder_negatives_kernel(), _store_kernel()*/ + : _k(0), 
_total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), _p_out_key_buf(nullptr), + _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index 3215d01a7..0754fd813 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -53,7 +53,7 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : _memory_manager(std::move(memory_manager)), _function() { } @@ -105,20 +105,20 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); switch (CLTransposeConvLayer::get_deconvolution_method( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate direct convolution layer ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); break; } case DeconvolutionMethod::GEMM: { // Validate gemm-based convolution layer ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); break; } default: @@ -130,9 +130,9 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf } DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( - const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, - ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, - unsigned int invalid_bottom, const WeightsInfo &weights_info) + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index a123439d9..e212a03c7 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -58,7 +58,7 @@ namespace Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ON_ERROR( - 
NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -78,11 +78,11 @@ Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), - _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), - _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), - _accumulate_biases(false), _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) { } @@ -103,8 +103,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -132,10 +132,10 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor bool _is_fc_after_conv; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -150,23 +150,23 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); // GEMM _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); configure_mm(&_quantized_input, 
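// The NEON hybrid fully-connected layer configured above keeps one float scale per batch row
// (_scale_factor) next to the int8 copy of the activations (_quantized_input). The sketch below
// shows that per-batch symmetric quantization in plain C++; the max|x|/127 mapping is the usual
// convention for this kind of dynamic-range kernel and is an assumption here rather than a quote
// of NEQuantizationSymmetricKernel.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

void quantize_symmetric_per_batch(const std::vector<std::vector<float>> &input, // [batch][col]
                                  std::vector<std::vector<int8_t>> &q_input,
                                  std::vector<float> &scale_factors)
{
  q_input.assign(input.size(), {});
  scale_factors.assign(input.size(), 0.f);
  for (size_t b = 0; b < input.size(); ++b)
  {
    float max_abs = 0.f;
    for (float v : input[b])
      max_abs = std::max(max_abs, std::fabs(v));
    const float scale = (max_abs > 0.f) ? max_abs / 127.f : 1.f;
    scale_factors[b] = scale;

    q_input[b].resize(input[b].size());
    for (size_t i = 0; i < input[b].size(); ++i)
    {
      long q = std::lround(input[b][i] / scale);
      q_input[b][i] = static_cast<int8_t>(std::max(-127L, std::min(127L, q)));
    }
  }
}

int main()
{
  std::vector<std::vector<int8_t>> q;
  std::vector<float> scales;
  quantize_symmetric_per_batch({{0.5f, -2.0f}, {10.0f, 0.25f}}, q, scales);
  std::printf("scales: %f %f  q[0]: %d %d\n", scales[0], scales[1], q[0][0], q[0][1]);
  return 0;
}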
weights_to_use, &_gemmlowp_output); // Multiply scale @@ -195,8 +195,8 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) @@ -217,7 +217,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -225,20 +225,19 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate quantization kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR( - NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( - &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); + &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index cb7557a5a..a639f2979 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -69,14 +69,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( - &input, &weights, nullptr, &output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + ARM_COMPUTE_RETURN_ON_ERROR( + 
NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); } return Status{}; @@ -84,12 +84,12 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } // namespace NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), - _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), - _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), - _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), - _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), - _accumulate_biases(false), _is_quantized(false), _is_prepared(false) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), + _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), + _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false), + _is_quantized(false), _is_prepared(false) { } @@ -105,9 +105,9 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, nullptr, output); @@ -129,8 +129,8 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen ITensor *output) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -138,8 +138,7 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - shape_flatten)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -169,8 +168,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; @@ -183,8 +182,7 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei if (_is_quantized) { _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); } // Configure accumulate biases kernel for non quantized asymmetric types @@ -208,10 +206,10 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -284,16 +282,16 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const ITensorInfo &flatten_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_flatten_shape(input))); + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr && !is_quantized) @@ -330,7 +328,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -338,7 +336,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -346,8 +344,8 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); @@ -365,7 +363,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (is_quantized) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( - &gemmlowp_output, biases, output)); + &gemmlowp_output, biases, output)); } return Status{}; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index dc6c78478..234c783f9 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -56,7 +56,7 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); bool is_hybrid = input->info()->data_type() == DataType::F32 && - (weights->info()->data_type() == DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp index 16d74e62d..451aa0997 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -46,9 +46,9 @@ namespace arm_compute { NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), - _permute_input(), _permute_output(), 
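// NEInstanceNormalizationLayerEx normalizes each (sample, channel) plane over its spatial
// values, which is why NHWC inputs are permuted to NCHW around the kernel. With the
// gamma/beta/epsilon parameters the layer takes, the formula is the standard
// y = gamma * (x - mean) / sqrt(var + epsilon) + beta; a one-channel sketch:
#include <cmath>
#include <cstdio>
#include <vector>

// One channel of one sample: "plane" holds the H*W spatial values.
void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
  float mean = 0.f;
  for (float v : plane)
    mean += v;
  mean /= plane.size();

  float var = 0.f;
  for (float v : plane)
    var += (v - mean) * (v - mean);
  var /= plane.size();

  const float inv_std = 1.f / std::sqrt(var + epsilon);
  for (float &v : plane)
    v = gamma * (v - mean) * inv_std + beta;
}

int main()
{
  std::vector<float> plane = {1.f, 2.f, 3.f, 4.f};
  instance_norm_plane(plane, /*gamma=*/1.f, /*beta=*/0.f, /*epsilon=*/1e-5f);
  for (float v : plane)
    std::printf("%f ", v); // roughly -1.34 -0.45 0.45 1.34
  std::printf("\n");
  return 0;
}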
_permuted_input(), _permuted_output() + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() { } @@ -88,8 +88,8 @@ Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const float epsilon) { return NEInstanceNormalizationLayerKernelEx::validate( - &input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); } void NEInstanceNormalizationLayerEx::run() diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp index cb1a26304..c45c335b3 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -49,8 +49,8 @@ using namespace arm_compute; NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -125,7 +125,7 @@ void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_a for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp index 26a887912..b21717e86 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -47,8 +47,8 @@ using namespace arm_compute; NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), - _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() { } @@ -122,7 +122,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b for (unsigned int i = 0; i < _reduction_ops; ++i) { TensorShape out_shape = - i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? 
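// NEReduceOperation and NEReduceSum reduce over several axes one at a time: each step sets the
// current axis to 1 in the intermediate shape (out_shape.set(axis_local[i], 1)), the
// intermediate tensors are chained, and a final reshape drops the size-1 axes when keep_dims is
// false. A small sketch of how the shapes evolve:
#include <cstdio>
#include <vector>

std::vector<std::vector<int>> reduction_shape_chain(std::vector<int> shape,
                                                    const std::vector<int> &axes)
{
  std::vector<std::vector<int>> chain;
  for (int axis : axes)
  {
    shape[axis] = 1; // the reduced axis collapses to a single element
    chain.push_back(shape);
  }
  return chain;
}

int main()
{
  // Shape {2, 3, 4, 5}, reducing over axes 1 and 2.
  for (const auto &s : reduction_shape_chain({2, 3, 4, 5}, {1, 2}))
  {
    for (int d : s)
      std::printf("%d ", d);
    std::printf("\n"); // prints: 2 1 4 5, then 2 1 1 5
  }
  // With keep_dims == false the trailing reshape then produces {2, 5}.
  return 0;
}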
input : (&_reduced_outs[i - 1]); @@ -135,7 +135,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()) - .set_data_layout(input->info()->data_layout())); + .set_data_layout(input->info()->data_layout())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::SUM); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index aa165cc15..50311071b 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -51,17 +51,9 @@ namespace arm_compute { NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _conv_f(), - _upsample_f(), - _flip_weights(), - _scaled_output(), - _weights_flipped(), - _flip_axis(), - _original_weights(nullptr), - _input(nullptr), - _info(), - _is_prepared(false) + : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(), + _scaled_output(), _weights_flipped(), _flip_axis(), _original_weights(nullptr), _input(nullptr), + _info(), _is_prepared(false) { } @@ -76,15 +68,15 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); const unsigned int width_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), - weights->dimension(height_idx), info, invalid_right, invalid_bottom); + input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); if (bias != nullptr) @@ -117,24 +109,24 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info( - input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo 
conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const unsigned int batches_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); const unsigned int channel_idx = - get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, WeightsInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo())); return Status{}; } @@ -146,21 +138,21 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( - input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = - get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, - invalid_right, invalid_bottom); + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); _input = input; _original_weights = weights; @@ -188,8 +180,8 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, DimensionRoundingType::FLOOR); diff --git a/compute/cker/include/cker/CpuBackendThreadpool.h b/compute/cker/include/cker/CpuBackendThreadpool.h new file mode 100644 index 000000000..cc6a9dbfc --- /dev/null +++ b/compute/cker/include/cker/CpuBackendThreadpool.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ +#define __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ + +#include <ruy/context.h> // from @ruy +#include <ruy/thread_pool.h> // from @ruy + +namespace nnfw +{ +namespace cker +{ +namespace cpu_backend_threadpool +{ + +using Task = ruy::Task; + +template <typename TaskType> +void Execute(int tasks_count, TaskType *tasks, ruy::Context *ruy_context) +{ + assert(tasks_count <= ruy_context->max_num_threads()); + ruy_context->mutable_thread_pool()->Execute(tasks_count, tasks); +} + +} // namespace cpu_backend_threadpool +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h index e08040632..8bf0bee03 100644 --- a/compute/cker/include/cker/NeonTensorUtils.h +++ b/compute/cker/include/cker/NeonTensorUtils.h @@ -131,7 +131,7 @@ inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, co const int kWeightsPerUint32 = 4; int8 *shuffled_vectors = reinterpret_cast<int8 *>( - aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); + aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); for (int i = 0; i < n_batch; i += 4) { @@ -145,25 +145,25 @@ inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, co while (unshuffled_vec0_ptr != end_vec0_ptr) { asm volatile( - // This code path requires that (n_cols % 16) == 0 so we can safely - // read in 16-byte chunks from each row. - "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" - "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" - "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" - "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" - - "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" - - : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr), - [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr), - [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr), - [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr), - [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr) - : - : "v0", "v1", "v2", "v3", "cc", "memory"); + // This code path requires that (n_cols % 16) == 0 so we can safely + // read in 16-byte chunks from each row. 
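// The new cker/CpuBackendThreadpool.h above is a thin wrapper over ruy's thread pool: a task is
// anything deriving from ruy::Task (aliased as Task) with a Run() override, and Execute()
// returns only after every task has run. A hedged usage sketch; the include paths and the
// row-range splitting are illustrative assumptions, not code from the patch:
#include <algorithm>
#include <vector>

#include <ruy/context.h>               // from @ruy
#include <cker/CpuBackendThreadpool.h> // the header added above

struct RowRangeTask : public nnfw::cker::cpu_backend_threadpool::Task
{
  RowRangeTask(int begin, int end) : row_begin(begin), row_end(end) {}
  void Run() override
  {
    for (int r = row_begin; r < row_end; ++r)
    {
      // ... process row r ...
    }
  }
  int row_begin;
  int row_end;
};

void run_rows_in_parallel(ruy::Context *ctx, int num_rows)
{
  // Never schedule more tasks than the context allows (Execute asserts on this).
  const int num_tasks = std::min(4, ctx->max_num_threads());
  const int rows_per_task = (num_rows + num_tasks - 1) / num_tasks;

  std::vector<RowRangeTask> tasks;
  for (int t = 0; t < num_tasks; ++t)
    tasks.emplace_back(t * rows_per_task, std::min(num_rows, (t + 1) * rows_per_task));

  nnfw::cker::cpu_backend_threadpool::Execute(static_cast<int>(tasks.size()), tasks.data(), ctx);
}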
+ "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" + "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" + "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" + "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" + + "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" + + : [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr), + [ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr), + [ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr), + [ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr), + [ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr) + : + : "v0", "v1", "v2", "v3", "cc", "memory"); } } @@ -204,104 +204,104 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols); asm volatile( - // Zero out the accumulator registers. - "dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - - "1:\n" // batch_cols_loop - - // Read 16 more bytes from a pair of matrix rows. - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - - // Prefetch two rows ahead. - "prfm pldl1strm, [%[mat_ptr2]]\n" - "prfm pldl1strm, [%[mat_ptr3]]\n" - - // Read from input vectors 4 times; 64 bytes total. - // Each 16-byte register contains parts of 4 vectors; see the - // shuffle logic above. - - // From Benoit, places to look in the future: - // - Move load instructions further from sdot - // - Switch loop use-then-reload - // - Do partial unrolling to use register space better - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - - // Update prefetch pointers. - "add %[mat_ptr2], %[mat_ptr2], #16\n" - "add %[mat_ptr3], %[mat_ptr3], #16\n" - - // Re-use those vectors for the next row as well. - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - - // If we're not done with these rows, continue. - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 1b\n" // batch_cols_loop - - // Done with the rows, sum the results. - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - // Convert the per-vector sums to floating point. - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Fetch scale factors. - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - - // Multiply scale factors times sums. - "fmul v0.4s, v4.4s, v0.4s\n" - "fmul v1.4s, v4.4s, v1.4s\n" - - // Load previous result values. - // The result position is: - // result[batch * m_rows + row] - // Here that is factored into: - // result_ptr = result + row - // *result_ptr = res[0] - // (uint8*)result_ptr += (m_rows * sizeof(float)) - // *result_ptr = res[1] - // ... 
- // Since we're reading two rows at a time, though, we read both - // result[batch * m_rows + row] - // and - // result[batch * m_rows + row + 1] - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - - // Go back to the starting position (subtract wide_rows * 4). - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - - // Add previous result values. - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - - // Store results. - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3) - : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr), - [wide_rows] "r"(wide_rows) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "cc", "memory"); + // Zero out the accumulator registers. + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + + "1:\n" // batch_cols_loop + + // Read 16 more bytes from a pair of matrix rows. + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + + // Prefetch two rows ahead. + "prfm pldl1strm, [%[mat_ptr2]]\n" + "prfm pldl1strm, [%[mat_ptr3]]\n" + + // Read from input vectors 4 times; 64 bytes total. + // Each 16-byte register contains parts of 4 vectors; see the + // shuffle logic above. + + // From Benoit, places to look in the future: + // - Move load instructions further from sdot + // - Switch loop use-then-reload + // - Do partial unrolling to use register space better + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + + // Update prefetch pointers. + "add %[mat_ptr2], %[mat_ptr2], #16\n" + "add %[mat_ptr3], %[mat_ptr3], #16\n" + + // Re-use those vectors for the next row as well. + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + + // If we're not done with these rows, continue. + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 1b\n" // batch_cols_loop + + // Done with the rows, sum the results. + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + // Convert the per-vector sums to floating point. + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Fetch scale factors. + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + + // Multiply scale factors times sums. + "fmul v0.4s, v4.4s, v0.4s\n" + "fmul v1.4s, v4.4s, v1.4s\n" + + // Load previous result values. + // The result position is: + // result[batch * m_rows + row] + // Here that is factored into: + // result_ptr = result + row + // *result_ptr = res[0] + // (uint8*)result_ptr += (m_rows * sizeof(float)) + // *result_ptr = res[1] + // ... 
+ // Since we're reading two rows at a time, though, we read both + // result[batch * m_rows + row] + // and + // result[batch * m_rows + row + 1] + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + + // Go back to the starting position (subtract wide_rows * 4). + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + + // Add previous result values. + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + + // Store results. + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "cc", "memory"); } } @@ -309,9 +309,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr } static void DotprodMatrixBatchFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { void *shuffled_vectors_free; const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); @@ -332,102 +332,102 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( const int32_t *batch_offsets_ptr = input_offset + batch; const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; - asm volatile("dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - // Load zero points. - "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - // Zero out zero point accumulators. - "dup v14.4s, wzr\n" - "dup v15.4s, wzr\n" - - // Load per channel scales if not null. 
- "cmp %w[is_channel_scale_nullptr], #0\n" - "bne 1f\n" - "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" - "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" - "fmul v16.4s, v16.4s, v4.4s\n" - "fmul v17.4s, v17.4s, v4.4s\n" - "b 2f\n" - "1:\n" - "mov v16.16b, v4.16b\n" - "mov v17.16b, v4.16b\n" - "2:\n" - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 3f\n" - // Accumulate row_sums for zero point calculations. - "saddlp v12.8h, v12.16b\n" - "saddlp v13.8h, v13.16b\n" - "sadalp v14.4s, v12.8h\n" - "sadalp v15.4s, v13.8h\n" - "3:\n" - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 2b\n" - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 4f\n" - // Calculate zero point offsets. - "addv s14, v14.4s\n" - "addv s15, v15.4s\n" - "dup v14.4s, v14.s[0]\n" - "dup v15.4s, v15.s[0]\n" - "b 5f\n" - "4:\n" - "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" - "ld1r {v15.4s}, [%[row_sums_ptr]]\n" - "5:\n" - - "mul v14.4s, v14.4s, v7.4s\n" - "mul v15.4s, v15.4s, v7.4s\n" - "sub v0.4s, v0.4s, v14.4s\n" - "sub v2.4s, v2.4s, v15.4s\n" - - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Multiply scale. - "fmul v0.4s, v16.4s, v0.4s\n" - "fmul v1.4s, v17.4s, v1.4s\n" - - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr) - : [mat_ptr0_end] "r"(mat_ptr0_end), - [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows), - [channel_scales_ptr] "r"(channel_scales_ptr), - [batch_offsets_ptr] "r"(batch_offsets_ptr), - [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr), - [is_row_sums_nullptr] "r"(is_row_sums_nullptr) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); + asm volatile( + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + // Load zero points. + "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + // Zero out zero point accumulators. + "dup v14.4s, wzr\n" + "dup v15.4s, wzr\n" + + // Load per channel scales if not null. 
+ "cmp %w[is_channel_scale_nullptr], #0\n" + "bne 1f\n" + "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" + "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "b 2f\n" + "1:\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "2:\n" + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 3f\n" + // Accumulate row_sums for zero point calculations. + "saddlp v12.8h, v12.16b\n" + "saddlp v13.8h, v13.16b\n" + "sadalp v14.4s, v12.8h\n" + "sadalp v15.4s, v13.8h\n" + "3:\n" + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 2b\n" + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 4f\n" + // Calculate zero point offsets. + "addv s14, v14.4s\n" + "addv s15, v15.4s\n" + "dup v14.4s, v14.s[0]\n" + "dup v15.4s, v15.s[0]\n" + "b 5f\n" + "4:\n" + "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" + "ld1r {v15.4s}, [%[row_sums_ptr]]\n" + "5:\n" + + "mul v14.4s, v14.4s, v7.4s\n" + "mul v15.4s, v15.4s, v7.4s\n" + "sub v0.4s, v0.4s, v14.4s\n" + "sub v2.4s, v2.4s, v15.4s\n" + + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Multiply scale. + "fmul v0.4s, v16.4s, v0.4s\n" + "fmul v1.4s, v17.4s, v1.4s\n" + + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ row_sums_ptr ] "+r"(row_sums_ptr) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows), [ channel_scales_ptr ] "r"(channel_scales_ptr), + [ batch_offsets_ptr ] "r"(batch_offsets_ptr), + [ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr), + [ is_row_sums_nullptr ] "r"(is_row_sums_nullptr) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); } } @@ -458,9 +458,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( // We don't use this kernel when n_batch = 1 because the baseline kernel // is fine for that case. 
inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { const int kWeightsPerUint32 = 4; @@ -475,14 +475,14 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_vectors_free; const int padded_vectors_size = batch_round_up * m_cols; int8_t *padded_vectors = reinterpret_cast<int8_t *>( - aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); + aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); memset(padded_vectors, 0, padded_vectors_size); void *padded_result_free; const int result_size = n_batch * m_rows * sizeof(float); const int padded_result_size = batch_round_up * m_rows * sizeof(float); float *padded_result = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); + aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); memcpy(padded_result, result, result_size); memset(reinterpret_cast<char *>(padded_result) + result_size, 0, padded_result_size - result_size); @@ -494,7 +494,7 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_scaling_factors_free; const int padded_scaling_factors_size = batch_round_up * sizeof(float); float *padded_scaling_factors = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); + aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size); assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size); memset(padded_scaling_factors, 0, batch_round_up * sizeof(float)); @@ -505,7 +505,7 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_input_offset_free; const int padded_input_offset_size = batch_round_up * sizeof(int32_t); int32_t *padded_input_offset = reinterpret_cast<int32_t *>( - aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); + aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size); assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size); memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t)); @@ -513,8 +513,8 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( // Call the main kernel. 
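The padded scratch buffers above all come from cker's aligned_alloc helper, which returns an aligned pointer plus the raw pointer the caller must later free(). A sketch of that over-allocate-and-round-up pattern; AlignedAllocSketch is a hypothetical standalone helper, not the header's exact implementation.

#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Over-allocate with malloc, round the address up to the requested power-of-two
// alignment, and hand the raw pointer back so the caller can free() it later.
inline void *AlignedAllocSketch(std::size_t alignment, std::size_t size, void **freeing_buffer)
{
  *freeing_buffer = std::malloc(size + alignment - 1);
  if (*freeing_buffer == nullptr)
    return nullptr;
  const std::uintptr_t raw = reinterpret_cast<std::uintptr_t>(*freeing_buffer);
  return reinterpret_cast<void *>((raw + alignment - 1) & ~(static_cast<std::uintptr_t>(alignment) - 1));
}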
DotprodMatrixBatchFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, - padded_result, per_channel_scale, padded_input_offset, row_sums); + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, padded_result, + per_channel_scale, padded_input_offset, row_sums); free(padded_input_offset_free); } @@ -533,13 +533,13 @@ inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( } inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result) { DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, - /*row_sums=*/nullptr); + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); } #endif // __aarch64__ @@ -736,7 +736,7 @@ inline void NeonSymmetricQuantizeFloats(const float *values, const int size, for (int i = postamble_start; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); + static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } @@ -830,7 +830,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m // Here the assumption is that each buffer is 4-byte aligned. Otherwise, // performance may suffer significantly. assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col)); const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col)); // Multiply the low bits (i.e. the lower 8 8bit numbers in the @@ -855,7 +855,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m // Here the assumption is that each buffer is 4-bytes aligned. // Otherwise, performance may suffer significantly. 
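The NEON matrix-vector paths here assume every row pointer they dereference is 4-byte aligned (kWeightsPerUint32) and assert it with a bitmask test. A standalone sketch of that check; IsAligned is an illustrative helper name.

#include <cstdint>

// True when ptr is a multiple of `alignment` (a power of two). This is the same test
// the asserts use: (address & (alignment - 1)) == 0 holds only for aligned addresses.
inline bool IsAligned(const void *ptr, std::uintptr_t alignment)
{
  return (reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0;
}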
assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col)); const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col)); const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8); @@ -952,7 +952,7 @@ inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ m const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1); const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0); const float32x4_t result1 = - vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); + vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); vst1q_f32(result, result0); vst1q_f32(result + 4 * result_stride, result1); } diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h index 3b3b27f72..2a58a2ec9 100644 --- a/compute/cker/include/cker/PortableTensorUtils.h +++ b/compute/cker/include/cker/PortableTensorUtils.h @@ -138,7 +138,7 @@ inline void PortableSymmetricQuantizeFloats(const float *values, const int size, for (int i = 0; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); + static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index acb6cac55..10f3ecbd3 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -389,6 +389,11 @@ struct SpaceToDepthParams int32_t block_size; }; +struct LeakyReluParams +{ + float alpha; +}; + enum class Order { kColMajor, @@ -475,9 +480,9 @@ enum class QuantizationFlavor // (only those that need perchannel quantization do). template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor = - std::is_floating_point<AccumScalar>::value - ? QuantizationFlavor::kFloatingPoint - : QuantizationFlavor::kIntegerWithUniformMultiplier> + std::is_floating_point<AccumScalar>::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> struct GemmParams { // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) @@ -504,12 +509,12 @@ struct GemmParams const AccumScalar *bias = nullptr; // min clamp bound of destination values. DstScalar clamp_min = std::is_floating_point<DstScalar>::value - ? -std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::lowest(); + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); // max clamp bound of destination values. DstScalar clamp_max = std::is_floating_point<DstScalar>::value - ? std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::max(); + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); }; // Validates self-consistency of GemmParams. diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index 2abb998d0..f73c01523 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -88,8 +88,8 @@ inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multip int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 
0 : -shift; return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), - right_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); } inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, @@ -103,7 +103,7 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int left_shift) { return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); } inline int NodeOffset(int b, int h, int w, int height, int width) @@ -162,7 +162,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, const F3 fixedpoint_input = F3::FromRaw(input >> 1); const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); const F3 fixedpoint_half_three = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); // Newton-Raphson iteration // Naive unoptimized starting guess: x = 1 F3 x = F3::One(); @@ -173,7 +173,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); } const F0 fixedpoint_half_sqrt_2 = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); x = x * fixedpoint_half_sqrt_2; *output_inv_sqrt = x.raw(); if (*output_shift < 0) @@ -429,7 +429,7 @@ template <typename T> class SequentialTensorWriter { public: SequentialTensorWriter(const T *input_data, T *output_data) - : input_data_(input_data), output_ptr_(output_data) + : input_data_(input_data), output_ptr_(output_data) { } diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h index 49c34211a..e3b10990e 100644 --- a/compute/cker/include/cker/eigen/EigenSupport.h +++ b/compute/cker/include/cker/eigen/EigenSupport.h @@ -39,17 +39,17 @@ namespace eigen_support // library. typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenMatrix; + EigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenMatrix; + ConstEigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenTensor; + EigenTensor; typedef Eigen::TensorMap<Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenTensor; + ConstEigenTensor; // Utility functions we need for the EigenTensor API. 
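The MultiplyByQuantizedMultiplier helpers shown above apply a real-valued scale encoded as a Q0.31 multiplier plus a power-of-two shift, using gemmlowp's saturating doubling high multiply and rounding shift. A plain floating-point reference of the same semantics, for illustration only; tie-breaking on exact halves may differ slightly from the bit-exact fixed-point routine.

#include <cmath>
#include <cstdint>

// Reference: the int32 multiplier encodes a real scale of quantized_multiplier * 2^(shift - 31).
inline int32_t MultiplyByQuantizedMultiplierRef(int32_t x, int32_t quantized_multiplier, int shift)
{
  const double real_scale = static_cast<double>(quantized_multiplier) * std::ldexp(1.0, shift - 31);
  return static_cast<int32_t>(std::lround(static_cast<double>(x) * real_scale));
}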
template <typename Device, typename T> struct MatMulConvFunctor diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h index f9c706370..40cb85432 100644 --- a/compute/cker/include/cker/eigen/Utils.h +++ b/compute/cker/include/cker/eigen/Utils.h @@ -36,9 +36,9 @@ namespace cker // Eigen::Map<Eigen::Matrix<const float, ...>> template <typename Scalar> using VectorMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Shape &shape) { @@ -51,10 +51,10 @@ template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Sha // above also applies here. template <typename Scalar> using MatrixMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, - Eigen::Dynamic>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + std::is_const<Scalar>::value, + Eigen::Map< + const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; template <typename Scalar> MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h index dc3e2552d..9d4fd2eaf 100644 --- a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h +++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h @@ -49,20 +49,19 @@ class TensorEvaluatorHasPartialPacket public: template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae( - typename std::enable_if< - unpacket_traits<PacketT>::masked_load_available && - std::is_same< - PacketT, - decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( - std::declval<IndexT>(), - std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) - -> std::true_type; + typename std::enable_if< + unpacket_traits<PacketT>::masked_load_available && + std::is_same<PacketT, + decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( + std::declval<IndexT>(), + std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) + -> std::true_type; template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae(...) -> std::false_type; typedef decltype( - functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; + functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; static constexpr bool value = status::value; }; @@ -71,9 +70,9 @@ public: // [from, to) range. If the mask bit is 1, element will be loaded/stored. 
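TensorEvaluatorHasPartialPacket above relies on the classic decltype/SFINAE detector idiom: the std::true_type overload is viable only when the expression inside decltype compiles. A stripped-down version of the same idiom for an arbitrary member function; HasRunMethod and the probe names are illustrative, not from the upstream code.

#include <type_traits>
#include <utility>

// probe(int) is preferred by overload resolution, but only participates when the
// expression inside decltype is well-formed for U; otherwise probe(...) is chosen.
template <typename T> class HasRunMethod
{
  template <typename U>
  static auto probe(int) -> decltype(std::declval<U &>().Run(0), std::true_type{});
  template <typename U> static std::false_type probe(...);

public:
  static constexpr bool value = decltype(probe<T>(0))::value;
};

struct WithRun
{
  void Run(int) {}
};
struct WithoutRun
{
};

static_assert(HasRunMethod<WithRun>::value, "Run(int) is detected");
static_assert(!HasRunMethod<WithoutRun>::value, "no Run(int) member");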
template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename std::enable_if<unpacket_traits<Packet>::masked_load_available, - typename unpacket_traits<Packet>::mask_t>::type - mask(int from, int to) + typename std::enable_if<unpacket_traits<Packet>::masked_load_available, + typename unpacket_traits<Packet>::mask_t>::type + mask(int from, int to) { const Index packet_size = internal::unpacket_traits<Packet>::size; eigen_assert(0 <= from && to <= (packet_size + 1) && from < to); diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h index 92e1614d1..c931ac518 100644 --- a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h +++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h @@ -62,30 +62,27 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionInputMapper< - Scalar_, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar_, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef Scalar_ Scalar; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper VectorMapper; typedef SubMapper LinearMapper; @@ -95,11 +92,11 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper( - const TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device> &tensor, - const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) - : m_impl(tensor.impl().impl()) + const TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device> + &tensor, + const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) + : m_impl(tensor.impl().impl()) { Index patch_rows; Index patch_depth; @@ -167,7 +164,7 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper) - : m_impl(base_mapper.m_impl) + : 
m_impl(base_mapper.m_impl) { m_patch_cols = base_mapper.m_patch_cols; m_num_patches = base_mapper.m_num_patches; @@ -280,11 +277,10 @@ public: private: friend class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; // Load coefficient from a patch specified by the "within patch offset" // (patchId) and the precomputed indices of the first element of the patch. @@ -298,14 +294,14 @@ private: const Index colOffset = patchOffset / m_fastColStride; const Index inputCol = colIndex + colOffset * m_in_col_strides; const Index origInputCol = (m_patch_col_inflate_strides == 1) - ? inputCol - : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); const Index rowOffset = patchOffset - colOffset * m_colStride; const Index inputRow = rowIndex + rowOffset * m_in_row_strides; const Index origInputRow = (m_patch_row_inflate_strides == 1) - ? inputRow - : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols || origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) || (inputRow != origInputRow * m_patch_row_inflate_strides)) @@ -314,7 +310,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -338,7 +334,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -390,7 +386,7 @@ private: // span[0] all the way upto (and including) span[1]. const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template partialPacket<Packet>(inputIndex - span[0], mask<Packet>(span[0], span[1] + 1)); } @@ -445,10 +441,10 @@ private: // Load partial packets and do bit-wise OR to generate required packet return internal::por<Packet>( - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], - patchOffsets2Cols[0], colOffsets[0]), - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], - patchOffsets2Cols[1], colOffsets[1])); + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], + patchOffsets2Cols[0], colOffsets[0]), + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], + patchOffsets2Cols[1], colOffsets[1])); } // Helper function to load a packet that is present in a single columns. 
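The loadCoeff paths above reduce to one pattern: decompose the within-patch offset into depth, row, and column, map that to the source image with the input strides, and return zero when the location lands in the padded border. A compact scalar version of that pattern; PatchGeometry and PatchCoeff are illustrative names, not the Eigen mapper's real members.

#include <cstddef>

// Geometry of the source image seen by the patch extractor; strides are in elements.
struct PatchGeometry
{
  std::ptrdiff_t row_stride;
  std::ptrdiff_t col_stride;
  std::ptrdiff_t input_rows;
  std::ptrdiff_t input_cols;
};

// Returns the coefficient at (depth, input_row, input_col), or 0 for padded positions,
// mirroring the bounds check before the strided index computation in loadCoeff.
inline float PatchCoeff(const float *input, const PatchGeometry &g, std::ptrdiff_t depth,
                        std::ptrdiff_t input_row, std::ptrdiff_t input_col)
{
  if (input_row < 0 || input_row >= g.input_rows || input_col < 0 || input_col >= g.input_cols)
    return 0.0f;
  return input[depth + input_row * g.row_stride + input_col * g.col_stride];
}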
@@ -477,7 +473,7 @@ private: // no padding const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); @@ -490,7 +486,7 @@ private: // load. template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<Packet>::size; @@ -538,7 +534,7 @@ private: // packets. template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<PacketT>::size; @@ -604,7 +600,7 @@ private: // no padding const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } @@ -627,10 +623,10 @@ private: computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const { const size_t NumInputDims = - array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; const Index patch2DIndex = - (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + (NumInputDims == 3) ? 
patchIndex : (patchIndex - otherIndex * m_num_patches); otherIndex *= m_patchInputStride; colIndex = patch2DIndex / m_fastOutputRows; rowIndex = patch2DIndex - colIndex * m_outputRows; @@ -689,31 +685,28 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef typename packet_traits<Scalar>::type Packet; typedef typename packet_traits<Scalar>::half HalfPacket; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - ParentMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + ParentMapper; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef Self LinearMapper; @@ -722,16 +715,16 @@ public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) + : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset + base_mapper.m_depth_offset), - m_col_offset(horiz_offset + base_mapper.m_col_offset), - m_base_mapper(base_mapper.m_base_mapper) + : m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset), + m_base_mapper(base_mapper.m_base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } @@ -766,7 +759,7 @@ public: { typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT; return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>( - i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); } template <typename Packet> EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; } @@ -781,7 +774,7 @@ public: 
EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { const Index max_col = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); return std::min<Index>(1 + max_col, patchCols()); } @@ -789,8 +782,8 @@ public: EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const { const Index max_row = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / - fastPatchRowStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / + fastPatchRowStride(); return std::min<Index>(1 + max_row, patchRows()); } @@ -862,7 +855,7 @@ public: } template <typename PacketT = Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const { const Index inputIndex = depth + baseIndex; @@ -913,8 +906,8 @@ public: const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides; *orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1) - ? input_row - : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); + ? input_row + : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) || (input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides); @@ -932,8 +925,8 @@ public: const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides; *orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1) - ? input_col - : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0); + ? input_col + : ((input_col >= 0) ? 
(input_col / m_base_mapper.m_fastInputColStride) : 0); return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) || (input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides); @@ -1033,23 +1026,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, - Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1159,7 +1149,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1248,22 +1238,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1378,7 +1366,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1472,22 +1460,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1582,27 +1568,25 @@ struct gemm_pack_rhs< */ template <typename Input, typename Kernel, typename OutputKernel = const NoOpOutputKernel> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< - internal::traits<Input>::Layout == ColMajor, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const OutputKernel>>, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const OutputKernel>>>::type + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const OutputKernel>>, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const 
TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const OutputKernel>>>::type SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1, const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME, const Index row_in_stride = 1, const Index col_in_stride = 1, @@ -1612,11 +1596,11 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str typedef typename internal::traits<Input>::Index TensorIndex; TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex>> - in(input); + in(input); TensorRef< - Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, - internal::traits<Kernel>::Layout, TensorIndex>> - kern(kernel); + Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, + internal::traits<Kernel>::Layout, TensorIndex>> + kern(kernel); EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1735,46 +1719,46 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str } if (padding_explicit) { - return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, - padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - .extract_image_patches( - kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, + col_stride, row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, + padding_bottom, padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, padding_bottom, + padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } else { return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, padding_type) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - 
.extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, - col_in_stride, padding_type) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, + col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } } diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h index 6149cafa7..a70e39cc9 100644 --- a/compute/cker/include/cker/operation/AveragePool.h +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -73,10 +73,10 @@ void AveragePool<float>(const PoolParams ¶ms, const Shape &input_shape, cons int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -146,11 +146,11 @@ inline void AveragePool16(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); @@ -283,11 +283,11 @@ inline void AveragePool32(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git 
a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h index e33b2fba5..980ad48dd 100644 --- a/compute/cker/include/cker/operation/BatchToSpaceND.h +++ b/compute/cker/include/cker/operation/BatchToSpaceND.h @@ -43,7 +43,7 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_ // Similarly, (*end_index) * block_shape_dim is rounded up too (note that // end_index is exclusive). *end_index = - std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); + std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); } template <typename T> @@ -116,7 +116,7 @@ inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1 for (int in_w = in_w_start; in_w < in_w_end; ++in_w) { const int out_w = - in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; + in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; assert(out_w >= 0); assert(out_w < output_width); T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h index d9917a9da..fe5f87746 100644 --- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -139,7 +139,7 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, // From this point it is assumed contractually that corresponding dimensions // in shape0 and shape1 are either (a) equal or (b) one or other equals 1. const bool swap_inputs = - params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; + params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0; const Shape *shape_b = swap_inputs ? 
&extended_shape0 : &extended_shape1; @@ -281,8 +281,8 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S break; case nnfw::cker::BinaryArithmeticOpType::MUL: optimized::BroadcastMulDispatchQuant8( - params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, + const_cast<uint8_t *>(input2_data), output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: case nnfw::cker::BinaryArithmeticOpType::POW: @@ -320,8 +320,8 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S break; case nnfw::cker::BinaryArithmeticOpType::POW: reference::BroadcastBinaryArithmeticOpSlow<float>( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - GetBinaryArtithmeticFn<op_type, float>()); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + GetBinaryArtithmeticFn<op_type, float>()); break; default: assert(false); diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h index d69b38aca..24d4cc4c7 100644 --- a/compute/cker/include/cker/operation/Common.h +++ b/compute/cker/include/cker/operation/Common.h @@ -82,7 +82,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (; i < bias_size; i++) { array_ptr[i] = - ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); + ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); } } #else // not NEON @@ -91,7 +91,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (int i = 0; i < bias_size; i++) { array_data[array_offset + i] = ActivationFunctionWithMinMax( - array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); + array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); } } #endif diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h index 47eb6034c..ac6af8487 100644 --- a/compute/cker/include/cker/operation/Comparison.h +++ b/compute/cker/include/cker/operation/Comparison.h @@ -42,7 +42,7 @@ inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data, const Shape &output_shape, bool *output_data) { const int64_t flatsize = // number of data.... 
- MatchingFlatSize(input1_shape, input2_shape, output_shape); + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = F(input1_data[i], input2_data[i]); @@ -79,9 +79,9 @@ inline void ComparisonWithScaling(ComparisonParams ¶ms, const Shape &input1_ const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[i] = F(scaled_input1_val, scaled_input2_val); } } @@ -111,8 +111,8 @@ BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *inp for (int c = 0; c < output_shape.Dims(3); ++c) { output_data[Offset(output_shape, b, y, x, c)] = - F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]); + F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]); } } } @@ -159,15 +159,15 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, for (int c = 0; c < output_shape.Dims(3); ++c) { const int32_t input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; + input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32_t input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val); } } @@ -175,55 +175,53 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, } } -#define TFLITE_COMPARISON_OP(name) \ - template <typename T> \ - inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, bool *output_data) \ - { \ - Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ - output_data); \ - } \ - template <typename T> \ - inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void name##WithScaling(ComparisonParams ¶ms, const Shape &input1_shape, \ - const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, \ - bool *output_data) \ - { \ - ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, 
output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##WithScaling(ComparisonParams ¶ms, \ - const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ +#define TFLITE_COMPARISON_OP(name) \ + template <typename T> \ + inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ + const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ + output_data); \ + } \ + template <typename T> \ + inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ } TFLITE_COMPARISON_OP(Equal); diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h index 394123e30..9aaca00b7 
100644 --- a/compute/cker/include/cker/operation/Concatenation.h +++ b/compute/cker/include/cker/operation/Concatenation.h @@ -142,7 +142,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams ¶ms, for (int j = 0; j < copy_size; ++j) { const int32_t value = - static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; + static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0)); } } diff --git a/compute/cker/include/cker/operation/DepthToSpace.h b/compute/cker/include/cker/operation/DepthToSpace.h new file mode 100644 index 000000000..e57fef01d --- /dev/null +++ b/compute/cker/include/cker/operation/DepthToSpace.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_DEPTH_TO_SPACE_H__ +#define __NNFW_CKER_DEPTH_TO_SPACE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void DepthToSpace(const Shape &unextended_input_shape, const T *input_data, + const Shape &unextended_output_shape, T *output_data, int32_t block_size) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + + const int output_depth = output_shape.Dims(3); + const int batch_size = output_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. 
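The memcpy loop that follows realizes the usual NHWC depth-to-space (DCR) index mapping. As a cross-check, a minimal standalone sketch of that mapping with plain arrays instead of cker::Shape (the helper and its names are illustrative, not part of the patch):

// Naive NHWC depth-to-space, assuming in_depth == block * block * out_depth.
void DepthToSpaceNaive(const float *in, float *out, int batch, int in_h, int in_w,
                       int in_depth, int block)
{
  const int out_depth = in_depth / (block * block);
  const int out_h = in_h * block;
  const int out_w = in_w * block;
  for (int b = 0; b < batch; ++b)
    for (int h = 0; h < out_h; ++h)
      for (int w = 0; w < out_w; ++w)
        for (int c = 0; c < out_depth; ++c)
        {
          const int ih = h / block, iw = w / block;
          const int ic = ((h % block) * block + (w % block)) * out_depth + c;
          out[((b * out_h + h) * out_w + w) * out_depth + c] =
            in[((b * in_h + ih) * in_w + iw) * in_depth + ic];
        }
}

The memcpy-based version in DepthToSpace.h produces the same result while copying block_size * output_depth contiguous values per call instead of iterating channel by channel.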
+ const int stride = block_size * output_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int in_h = 0; in_h < input_height; ++in_h) + { + const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0); + for (int offset_h = 0; offset_h < block_size; ++offset_h) + { + const T *src = input_ptr; + for (int in_w = 0; in_w < input_width; ++in_w) + { + memcpy(output_data, src, stride * sizeof(T)); + output_data += stride; + src += input_depth; + } + input_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h index 814a9e019..436ddd8c9 100644 --- a/compute/cker/include/cker/operation/DepthwiseConv.h +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -22,143 +22,159 @@ #include "cker/Types.h" #include "cker/Utils.h" #include "cker/neon/neon_check.h" +#include "cker/operation/optimized/DepthwiseConvFloat.h" #include "cker/operation/optimized/DepthwiseConvUint8.h" +#include "cker/CpuBackendThreadpool.h" namespace nnfw { namespace cker { -inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &filter_shape, - const uint8_t *filter_data, const Shape &bias_shape, - const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) +// TODO(luwa): add multithread to per-channel depthwise_conv +// DepthwiseConv can run with multi threads on the dim specified by thread_dim. +// Each thread processes output elements on dim, thread_dim, in the range of +// [thread_start, thread_end). +// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it +// means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :]. +template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { - const int depth_multiplier = params.depth_multiplier; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - assert(dilation_width_factor >= 1); - assert(dilation_height_factor >= 1); - UNUSED_RELEASE(dilation_width_factor); - UNUSED_RELEASE(dilation_height_factor); - assert(input_shape.DimensionsCount() == 4); - assert(filter_shape.DimensionsCount() == 4); - assert(output_shape.DimensionsCount() == 4); - assert(output_activation_min <= output_activation_max); - UNUSED_RELEASE(output_activation_min); - UNUSED_RELEASE(output_activation_max); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_depth = input_shape.Dims(3); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(input_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(depth_multiplier); - -// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on -// Jetson TX-2. This compiler does not support the offsetof() macro. 
-#if defined(__aarch64__) -// TODO Use below codes - -// const int stride_width = params.stride_width; -// const int stride_height = params.stride_height; -// const int pad_width = params.padding_values.width; -// const int pad_height = params.padding_values.height; -// const int output_shift = params.output_shift; -// -// // Call kernel optimized for depthwise convolutions using 3x3 filters if -// // parameters are supported. -// if (Fast3x3FilterKernelSupported( -// input_shape, filter_shape, stride_width, stride_height, -// dilation_width_factor, dilation_height_factor, pad_width, pad_height, -// depth_multiplier, output_shape, output_shift)) { -// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, -// filter_data, bias_shape, bias_data, output_shape, -// output_data); -// return; -// } -#endif - - optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data); + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), input_shape_(input_shape), input_data_(input_data), + filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape), + bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data), + thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_, + bias_shape_, bias_data_, output_shape_, output_data_, + thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + // const CpuFlags& cpu_flags_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape) +{ + // How many scalar multiplications are needed to make it worth using one + // more thread + static constexpr int kMinMulPerThread = 1 << 13; // 8k + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_muls = output_shape.FlatSize() * filter_height * filter_width; + // Try to avoid real runtime divisions if possible by dividing by a + // compile-time constant. + int thread_count = std::max(1, num_muls / kMinMulPerThread); + return thread_count; +} + +inline bool MultithreadAlongBatches(int thread_count, int batches) +{ + assert(thread_count >= 2); + // If there are fewer batch entries than the number of threads we want to use, + // then better do intra-batch-entry multithreading. 
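For scale, a worked instance of the multiply-count heuristic in HowManyConvThreads above (illustrative numbers only; the code further down additionally clamps the result to the backend's max_num_threads and, for the float path, to 2). MultithreadAlongBatches then decides whether those threads split the work across batches or across output rows:

// kMinMulPerThread = 1 << 13 (8k), as in HowManyConvThreads.
constexpr long long kMinMulPerThread = 1 << 13;
constexpr long long num_muls = 1LL * 112 * 112 * 32 * 3 * 3; // 1x112x112x32 output, 3x3 filter
constexpr long long thread_count = num_muls / kMinMulPerThread;
static_assert(thread_count == 441, "about 3.6M multiplies -> 441 candidate threads");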
+ if (batches < thread_count) + { + return false; + } + // If there are at least 2 batch entries to be handed to each thread, then + // it's safe to proceed with batch-wise multithreading: each thread will have + // approximately equal number of batch entries to handle, so the load + // balancing will be reasonable, and the amount to which the load is not + // perfectly balanced will be offset by the inherent advantages of + // batch-wise multithreading (each thread is more efficient thanks to working + // on larger buffers with less boundary-handling overhead). + if (batches >= 2 * thread_count) + { + return true; + } + // In the limit case were there are at least 1 but not much more than 1 + // batch entries per thread, it may be a good idea to do per-batch + // multithreading if the number of batch entries is a multiple of the number + // of threads, so that each thread will have the same number of batch entries + // to process. + return ((batches % thread_count) == 0); } +template <typename T, typename TS> inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const float *input_data, const Shape &filter_shape, - const float *filter_data, const Shape &bias_shape, const float *bias_data, - const Shape &output_shape, float *output_data) + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, ruy::Context *ruy_context) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int depth_multiplier = params.depth_multiplier; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; assert(input_shape.DimensionsCount() == 4); assert(filter_shape.DimensionsCount() == 4); assert(output_shape.DimensionsCount() == 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int input_depth = input_shape.Dims(3); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); + int thread_count = HowManyConvThreads(output_shape, filter_shape); + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads(); + + thread_count = std::max(1, std::min(thread_count, max_threads)); + // Cap the number of threads to 2 for float path to avoid regression in + // performance (b/132294857). 
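Once a dimension is chosen, the dispatcher further down splits it across tasks as evenly as possible with an integer-only formula. A standalone sketch of that split (an assumed free function, not the cker API):

#include <utility>
#include <vector>

// Mirrors: thread_end = thread_start + (dim_size - thread_start) / (threads left)
std::vector<std::pair<int, int>> SplitRanges(int dim_size, int thread_count)
{
  std::vector<std::pair<int, int>> ranges;
  int start = 0;
  for (int i = 0; i < thread_count; ++i)
  {
    const int end = start + (dim_size - start) / (thread_count - i);
    ranges.emplace_back(start, end); // task i covers [start, end) of the chosen dim
    start = end;
  }
  return ranges;
}

For example, SplitRanges(10, 4) yields [0,2), [2,4), [4,7), [7,10): every task gets within one row of the others and the whole range is covered exactly once.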
+ if (std::is_floating_point<T>::value) + { + thread_count = std::min(thread_count, 2); + } + + const int output_batches = output_shape.Dims(0); const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(bias_shape); - for (int b = 0; b < batches; ++b) + if (thread_count == 1) + { + optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, 0, output_height, + 1); + return; + } + + int thread_dim, thread_dim_size; + if (MultithreadAlongBatches(thread_count, output_batches)) + { + thread_dim = 0; + thread_dim_size = output_batches; + } + else + { + thread_dim = 1; + thread_dim_size = output_height; + } + + std::vector<DepthwiseConvWorkerTask<T, TS>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). + tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int ic = 0; ic < input_depth; ++ic) - { - for (int m = 0; m < depth_multiplier; m++) - { - const int oc = m + ic * depth_multiplier; - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - const int in_y = in_y_origin + dilation_height_factor * filter_y; - // If the location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) - { - float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)]; - float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)]; - total += (input_value * filter_value); - } - } - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[oc]; - } - output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax( - total + bias_value, output_activation_min, output_activation_max); - } - } - } - } + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); } } // namespace cker diff --git a/compute/cker/include/cker/operation/ELU.h b/compute/cker/include/cker/operation/ELU.h new file mode 100644 index 000000000..6bdd7c62e --- /dev/null +++ b/compute/cker/include/cker/operation/ELU.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ELU_H__ +#define __NNFW_CKER_ELU_H__ + +#include "cker/Shape.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void ELU(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + const float val = input_data[i]; + output_data[i] = val < 0.0 ? std::exp(val) - 1 : val; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ELU_H__ diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h index 3d1837f47..13fccfd15 100644 --- a/compute/cker/include/cker/operation/Einsum.h +++ b/compute/cker/include/cker/operation/Einsum.h @@ -394,8 +394,8 @@ private: for (int label = 0; label < num_labels; ++label) { bool removed = (_output_label_counts[label] == 0); - bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 || - _input_label_counts[1][label] == 0; + bool unique = + num_inputs == 1 || _input_label_counts[0][label] == 0 || _input_label_counts[1][label] == 0; _label_types[label] = getDimensionType(removed, unique); } } @@ -483,8 +483,8 @@ private: if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size()) { throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " + - std::to_string(labels->size() - 1) + " but got: " + - std::to_string(inputs[i].shape.DimensionsCount())}; + std::to_string(labels->size() - 1) + + " but got: " + std::to_string(inputs[i].shape.DimensionsCount())}; } int ellipsis_axis = -1; const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1; @@ -511,7 +511,7 @@ private: } std::vector<bool>::iterator it_input = - std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); + std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis) { return; @@ -645,11 +645,11 @@ private: // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor. const int32_t output_size = - reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; + reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; functor::ReduceFunctor<Eigen::ThreadPoolDevice, Reducer>::Reduce( - device, output->shaped<T, 1>({output_size}), - input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), - Reducer()); + device, output->shaped<T, 1>({output_size}), + input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), + Reducer()); } bool shouldSwapFreeAndContract(const Labels &labels, @@ -779,7 +779,7 @@ private: { const int32_t count = label_counts[label]; const int current_axis = - should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size(); + should_inflate ? 
strided_shape_dims.size() : inflated_shape_dims.size(); const int32_t dim = input.shape.Dims(current_axis); strided_shape_dims.push_back(dim); inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim); @@ -879,7 +879,7 @@ private: for (size_t i = 0; i < inputs.size(); ++i) { const int32_t free_axis = - inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); + inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis)); } bool adj_x = swap_free_and_contract[0]; diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h index 9d080d89b..0e980f18e 100644 --- a/compute/cker/include/cker/operation/Elementwise.h +++ b/compute/cker/include/cker/operation/Elementwise.h @@ -98,6 +98,28 @@ inline void Floor(const Shape &input_shape, const float *input_data, const Shape } } +inline void Sqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::sqrt(input_data[i]); + } +} + +inline void Square(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = input_data[i] * input_data[i]; + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h index 14daf9839..d657acc12 100644 --- a/compute/cker/include/cker/operation/Fill.h +++ b/compute/cker/include/cker/operation/Fill.h @@ -24,27 +24,12 @@ namespace nnfw { namespace cker { -template <typename T> -inline void Fill(const Shape &input_shape, int *input_data, const T value_data, - const Shape &output_shape, T output_data) +template <typename T> inline void Fill(const T value_data, const Shape &output_shape, T output_data) { - int input_size = input_shape.FlatSize(); - int output_size = 1; - for (int i = 0; i < input_size; i++) + int output_size = output_shape.FlatSize(); + for (int i = 0; i < output_size; i++) { - output_size *= input_data[i]; - } - - if (output_size == output_shape.FlatSize()) - { - for (int i = 0; i < output_size; i++) - { - output_data[i] = *value_data; - } - } - else - { - throw std::runtime_error("Cker Fill.h: output's size is not matched inferred size of output"); + output_data[i] = *value_data; } } diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 958532402..b7d27e85d 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -117,7 +117,7 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu const int filter_dim_count = filter_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); const int output_depth = - MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); + MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { @@ -229,7 +229,7 @@ inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, const int 
weights_dims_count = weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); @@ -249,7 +249,7 @@ inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, { int idx_1 = w1_indices[pw1]; output_data[b * output_depth + idx_0] += - weights_data[pw1] * input_data[b * accum_depth + idx_1]; + weights_data[pw1] * input_data[b * accum_depth + idx_1]; } } } diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h index 28ae7a3bc..df397f73e 100644 --- a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h +++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h @@ -70,7 +70,7 @@ inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams ¶ms, const int weights_dims_count = weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h index d17a5796b..8a97d8421 100644 --- a/compute/cker/include/cker/operation/FusedBatchNorm.h +++ b/compute/cker/include/cker/operation/FusedBatchNorm.h @@ -105,7 +105,7 @@ public: float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size)); // This adjustment is for Bessel's correction float rest_size_adjust = - static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); + static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth); @@ -117,12 +117,12 @@ public: batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv; auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale) - .eval() - .reshape(one_by_depth) - .broadcast(bcast_spec); + .eval() + .reshape(one_by_depth) + .broadcast(bcast_spec); auto x_scaled = x_centered * scaling_factor; auto x_shifted = - (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); + (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); UNUSED_RELEASE(rest_size_adjust); diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h index a0abf2935..211db98ce 100644 --- a/compute/cker/include/cker/operation/Helper/BCast.h +++ b/compute/cker/include/cker/operation/Helper/BCast.h @@ -22,7 +22,7 @@ * ToDo : This file will be moved into upper folder when integrate with other * custom operations. * And It should merged with EinsumHelper's BCast. 
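The Eigen expressions reindented in FusedBatchNorm.h above amount to the standard training-mode batch-norm arithmetic: per channel, subtract the batch mean, scale by the reciprocal standard deviation, then apply scale and offset (the rest_size_adjust noted there is Bessel's correction). A per-element scalar sketch of the same formula (an illustrative helper, not the cker API):

#include <cmath>

// y = (x - mean) * rsqrt(var + epsilon) * scale + offset
inline float BatchNormOne(float x, float mean, float variance, float scale, float offset,
                          float epsilon)
{
  const float inv_std = 1.0f / std::sqrt(variance + epsilon);
  return (x - mean) * inv_std * scale + offset;
}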
-**/ + **/ #include "cker/Shape.h" #include "cker/eigen/EigenSupport.h" @@ -393,7 +393,7 @@ public: BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true, const bool return_flattened_batch_indices = false) - : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) + : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) { } diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h index baeafd7c9..cbebff142 100644 --- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -168,7 +168,7 @@ public: // Must have lo < hi UniformDistribution(int32_t lo, int32_t hi) - : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) + : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) { } @@ -207,7 +207,7 @@ public: // Must have lo < hi UniformDistribution(int64_t lo, int64_t hi) - : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) + : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) { } @@ -291,22 +291,22 @@ public: template <typename Generator> class UniformFullIntDistribution<Generator, int32_t> - : public UniformFullIntDistribution32<Generator, int32_t> + : public UniformFullIntDistribution32<Generator, int32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint32_t> - : public UniformFullIntDistribution32<Generator, uint32_t> + : public UniformFullIntDistribution32<Generator, uint32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, int64_t> - : public UniformFullIntDistribution64<Generator, int64_t> + : public UniformFullIntDistribution64<Generator, int64_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint64_t> - : public UniformFullIntDistribution64<Generator, uint64_t> + : public UniformFullIntDistribution64<Generator, uint64_t> { }; @@ -324,7 +324,7 @@ public: PHILOX_DEVICE_INLINE explicit SingleSampleAdapter(Generator *gen) - : generator_(gen), used_result_index_(Generator::kResultElementCount) + : generator_(gen), used_result_index_(Generator::kResultElementCount) { } @@ -615,8 +615,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, double> public: // The number of elements that will be returned. static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) - ? SingleSampleGenerator::kNativeElementCount / 2 - : 1; + ? SingleSampleGenerator::kNativeElementCount / 2 + : 1; // Cost of generation of a single element (in cycles). 
static constexpr int kElementCost = 90; // Indicate that this distribution may take variable number of samples diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h index 85d267723..6e9ffbdfd 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -109,7 +109,7 @@ template <class Distribution> struct FillPhiloxRandomTask<Distribution, true> { const int kGroupSize = Distribution::kResultElementCount; static const int kGeneratorSkipPerOutputGroup = - kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; + kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; int64_t offset = 0; diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h index e6ac008a5..ec29a15c3 100644 --- a/compute/cker/include/cker/operation/Helper/Tensor.h +++ b/compute/cker/include/cker/operation/Helper/Tensor.h @@ -29,58 +29,58 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> str { // Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Tensor; + Tensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstTensor; + ConstTensor; // Unaligned Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>> UnalignedTensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>> - UnalignedConstTensor; + UnalignedConstTensor; typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned> - Tensor32Bit; + Tensor32Bit; // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Scalar; + Scalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstScalar; // Unaligned Scalar tensor of scalar type T. typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedScalar; + UnalignedScalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> + UnalignedConstScalar; // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstFlat; + ConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstVec; + ConstVec; // Unaligned Rank-1 tensor (vector) of scalar type T. 
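The TTypes struct being realigned here only bundles Eigen::TensorMap typedefs; the same non-owning views can be spelled out directly. A standalone Eigen usage sketch (not part of the patch):

#include <unsupported/Eigen/CXX11/Tensor>

void MapExample()
{
  float buffer[6] = {1, 2, 3, 4, 5, 6};
  // A non-owning 2x3 row-major view over buffer, equivalent to
  // TTypes<float, 2>::UnalignedMatrix.
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> mat(buffer, 2, 3);
  mat(1, 2) = 42.0f; // writes through to buffer[5]
}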
typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedFlat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> - UnalignedConstFlat; + UnalignedConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedVec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> UnalignedConstVec; // Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstMatrix; + ConstMatrix; // Unaligned Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>> UnalignedMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>> - UnalignedConstMatrix; + UnalignedConstMatrix; }; typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32; diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h index 6445e8a2b..8fa8b03bc 100644 --- a/compute/cker/include/cker/operation/InstanceNorm.h +++ b/compute/cker/include/cker/operation/InstanceNorm.h @@ -78,8 +78,8 @@ inline void InstanceNorm(const InstanceNormParams ¶ms, const Shape &input_sh double input_value = input_data[Offset(output_shape, batch, height, width, channel)]; double output_value = input_value * a + b; output_data[Offset(output_shape, batch, height, width, channel)] = - ActivationFunctionWithMinMax((float)output_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax((float)output_value, output_activation_min, + output_activation_max); } } } diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h index a0075c3d0..c1fca91cc 100644 --- a/compute/cker/include/cker/operation/L2Normalize.h +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -77,7 +77,7 @@ void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uin { int32_t diff = *input_data - input_zero_point; int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); int32_t unclamped_output_val = 128 + rescaled_diff; int32_t output_val = std::min(static_cast<int32_t>(255), std::max(static_cast<int32_t>(0), unclamped_output_val)); diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h index 27beaaead..a8f1f8ca3 100644 --- a/compute/cker/include/cker/operation/LSTM.h +++ b/compute/cker/include/cker/operation/LSTM.h @@ -283,23 +283,23 @@ void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, const float // contiguous, and we manually loop over the batched outputs. 
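Every gate reindented in this LstmStepFloat hunk follows the same affine-plus-activation pattern. A scalar sketch of one gate for a single batch entry, ignoring the optional auxiliary input, peephole and layer-norm terms (the names, the row-major weight layout and the helper itself are assumptions for illustration, not the cker API):

#include <cmath>

// gate[j] = sigmoid( sum_i W_x[j][i] * x[i] + sum_k W_h[j][k] * h_prev[k] + bias[j] )
void LstmGateSketch(const float *x, const float *h_prev, const float *W_x, const float *W_h,
                    const float *bias, int n_input, int n_output, int n_cell, float *gate)
{
  for (int j = 0; j < n_cell; ++j)
  {
    float acc = bias ? bias[j] : 0.0f;
    for (int i = 0; i < n_input; ++i)
      acc += W_x[j * n_input + i] * x[i];
    for (int k = 0; k < n_output; ++k)
      acc += W_h[j * n_output + k] * h_prev[k];
    gate[j] = 1.0f / (1.0f + std::exp(-acc)); // the cell update gate uses tanh instead
  }
}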
// LINT.IfChange inline void LstmStepFloat( - const float *input_ptr, const float *input_to_input_weights_ptr, - const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, - const float *input_to_output_weights_ptr, const float *aux_input_ptr, - const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, - const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, - const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, - const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, - const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, - const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, - const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, - const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, - const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, - const float *output_gate_bias_ptr, const float *projection_weights_ptr, - const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, - int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, - float *output_state_ptr, float *cell_state_ptr, float *scratch0, float *scratch1, - float *scratch2, float *scratch3, float *output_ptr) + const float *input_ptr, const float *input_to_input_weights_ptr, + const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, + const float *input_to_output_weights_ptr, const float *aux_input_ptr, + const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, + const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, + const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, + const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, + const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, + const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, + const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, + const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, + const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, + const float *output_gate_bias_ptr, const float *projection_weights_ptr, + const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input, + int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr, + float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3, + float *output_ptr) { // Since we have already checked that weights are all there or none, we can // check the existence of only one to the get the condition. @@ -314,7 +314,7 @@ inline void LstmStepFloat( // Check if inputs are all zeros so we can skip some computations. const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input); const bool is_aux_input_all_zeros = - (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); if (!use_cifg) { // Calculate the input gate. (If not CIFG.) 
@@ -336,11 +336,11 @@ inline void LstmStepFloat( forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); // Calculate the cell update gate. CalculateLstmGateFloat( - input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, - output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, - /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, - n_batch, n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, - is_input_all_zeros, is_aux_input_all_zeros); + input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, + output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, + /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, + is_input_all_zeros, is_aux_input_all_zeros); // Update the cell state. UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch, cell_gate_scratch, use_cifg, params->cell_clip); diff --git a/compute/cker/include/cker/operation/LeakyReLU.h b/compute/cker/include/cker/operation/LeakyReLU.h new file mode 100644 index 000000000..e12d01bba --- /dev/null +++ b/compute/cker/include/cker/operation/LeakyReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_LEKAY_RELU_H__ +#define __NNFW_CKER_LEKAY_RELU_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void LeakyReLU(const LeakyReluParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &output_shape, float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + const float val = input_data[i]; + // Note that alpha might be > 1 or < 0, so we don't use std::max here. + output_data[i] = val > 0 ? 
val : val * params.alpha; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU_H__ diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h index 326a44f0c..eb7bdd900 100644 --- a/compute/cker/include/cker/operation/LogSoftMax.h +++ b/compute/cker/include/cker/operation/LogSoftMax.h @@ -71,7 +71,7 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, const Shape &input_shape, for (int c = 0; c < depth; ++c) { output_data[(i * depth + c) * inner_size + j] = - (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; + (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; } } } @@ -124,10 +124,10 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, float input_scale, const Sha for (int c = 0; c < depth; ++c) { const float log_prob = - scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; + scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; const int32_t prob_quantized = std::rint(log_prob) + params.zero_point; output_data[(i * depth + c) * inner_size] = - static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); } } } diff --git a/compute/cker/include/cker/operation/LogicalAnd.h b/compute/cker/include/cker/operation/LogicalAnd.h new file mode 100644 index 000000000..e877f5f47 --- /dev/null +++ b/compute/cker/include/cker/operation/LogicalAnd.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
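For the float path of LogSoftMax.h touched above, the reindented output line is the tail of the usual numerically stable formulation: shift by the slice maximum, scale by beta, subtract the log of the summed exponentials. A self-contained sketch over one slice of depth values (an assumed standalone helper, not the cker entry point):

#include <algorithm>
#include <cmath>

// Assumes depth >= 1.
void LogSoftmax1D(const float *in, float *out, int depth, float beta)
{
  float max_val = in[0];
  for (int c = 1; c < depth; ++c)
    max_val = std::max(max_val, in[c]);
  float sum = 0.0f;
  for (int c = 0; c < depth; ++c)
    sum += std::exp((in[c] - max_val) * beta);
  const float log_sum = std::log(sum);
  for (int c = 0; c < depth; ++c)
    out[c] = (in[c] - max_val) * beta - log_sum; // matches the reformatted output line
}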
+ */ + +#ifndef __NNFW_CKER_LOGICAL_AND_H__ +#define __NNFW_CKER_LOGICAL_AND_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void LogicalAndBroadcast(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = in1_val && in2_val; + } + } + } + } +} + +template <typename T> +inline void LogicalAndElementwise(const Shape &shape, const T *input1_data, const T *input2_data, + T *output_data) +{ + + int num_elements = shape.FlatSize(); + + for (int t = 0; t < num_elements; t++) + { + output_data[t] = input1_data[t] && input2_data[t]; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_LOGICAL_AND_H__ diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h index 5674ff3ef..ef2868455 100644 --- a/compute/cker/include/cker/operation/MatrixBandPart.h +++ b/compute/cker/include/cker/operation/MatrixBandPart.h @@ -43,11 +43,11 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap if (!(num_lower_diags <= row_num)) throw std::runtime_error( - "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); + "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); if (!(num_upper_diags <= col_num)) throw std::runtime_error( - "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); + "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init @@ -60,9 +60,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap auto input = input_data + (batch * row_num * col_num + row * col_num); const T band_start = - num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); - const T band_end = num_upper_diags < 0 ? col_num : std::min(static_cast<T>(col_num), - row + num_upper_diags + 1); + num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); + const T band_end = num_upper_diags < 0 + ? 
col_num + : std::min(static_cast<T>(col_num), row + num_upper_diags + 1); for (T band_idx = band_start; band_idx < band_end; band_idx++) { diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h index ea3fcaca6..5dc84d368 100644 --- a/compute/cker/include/cker/operation/MaxPool.h +++ b/compute/cker/include/cker/operation/MaxPool.h @@ -67,10 +67,10 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -79,8 +79,8 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl { int out_offset = NodeOffset(b, ph, pw, output_height, output_width); out_mat.col(out_offset) = - out_mat.col(out_offset) - .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); + out_mat.col(out_offset) + .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); } } } @@ -139,8 +139,8 @@ void MaxPool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, const const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h index c0dbc6df5..ddc27b4c2 100644 --- a/compute/cker/include/cker/operation/OneHot.h +++ b/compute/cker/include/cker/operation/OneHot.h @@ -55,7 +55,7 @@ void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t ax for (int k = 0; k < suffix_dim_size; ++k, ++output_data) { *output_data = - static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; + static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; } } } diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h index 5c3a773a2..d6ccc68c8 100644 --- a/compute/cker/include/cker/operation/Range.h +++ b/compute/cker/include/cker/operation/Range.h @@ -35,8 +35,8 @@ template <typename T> inline int GetSize(T start, T limit, T delta) } int size = (std::is_integral<T>::value - ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) - : std::ceil(std::abs((limit - start) / delta))); + ? 
((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) + : std::ceil(std::abs((limit - start) / delta))); return size; } diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index 2b2e8d338..dbf938147 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -50,7 +50,7 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape { int r_idx = 0; float tmp_data[4] = { - 0, + 0, }; float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data); for (; r_idx <= reduce_size - 32; r_idx += 32) @@ -143,7 +143,7 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; @@ -319,7 +319,7 @@ public: for (size_t idx = 0; idx < num_outputs; ++idx) { const U value = - static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; + static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; output_data[idx] = static_cast<T>(value); } } @@ -329,7 +329,7 @@ public: for (size_t idx = 0; idx < num_outputs; ++idx) { float float_mean = - static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); + static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); float result = std::min(std::round(float_mean * scale + bias) + output_zero_point, static_cast<float>(std::numeric_limits<T>::max())); result = std::max(result, static_cast<float>(std::numeric_limits<T>::min())); diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h index 2e4fc6274..924e85037 100644 --- a/compute/cker/include/cker/operation/ReduceMean.h +++ b/compute/cker/include/cker/operation/ReduceMean.h @@ -72,9 +72,9 @@ inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = - reducer(output_data[output_offset], input_data[input_offset], normalizer); + reducer(output_data[output_offset], input_data[input_offset], normalizer); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; } @@ -102,7 +102,7 @@ inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape, { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return normalizer; @@ -185,8 +185,8 @@ public: } size_t normalizer = - ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, - temp_index_data(), reducer, 
_temp_sum.data()); + ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, + temp_index_data(), reducer, _temp_sum.data()); if (num_outputs > 0) { float scale = input_scale / output_scale; @@ -231,6 +231,37 @@ void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_sca sum_reducer); } +template <typename In, typename Out> +void MeanAxis1And2(const Shape &input_shape, const In *input_data, const Shape &output_shape, + Out *output_data) +{ + UNUSED_RELEASE(output_shape); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batch = output_shape.Dims(0); + const int output_depth = output_shape.Dims(3); + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + float value = 0; + for (int in_h = 0; in_h < input_height; ++in_h) + { + for (int in_w = 0; in_w < input_width; ++in_w) + { + value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; + } + } + output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height); + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h index 7fc1e9123..8d9a7495f 100644 --- a/compute/cker/include/cker/operation/ResizeBilinear.h +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -62,7 +62,7 @@ inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t // Bottom right corner. output_data[output_offset + output_x_offset + output_y_offset] = - (output + ((x1y0 + x1y1) / 2)) / 2; + (output + ((x1y0 + x1y1) / 2)) / 2; } } @@ -192,8 +192,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei &x1); int32_t input_offset[4] = { - Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), - Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; + Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), (1 - (input_y - y0)) * (input_x - x0), (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)}; @@ -202,8 +202,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei { const T *input_ptr = &input_data[d]; *output_ptr++ = static_cast<T>( - input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + - input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); + input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); } } } @@ -253,16 +253,16 @@ void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); float height_scale = (params.align_corners && params.output_height > 1) - ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) - : (static_cast<float>(input_height) / params.output_height); + ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) + : (static_cast<float>(input_height) / params.output_height); float width_scale = (params.align_corners && params.output_width > 1) - ? 
(static_cast<float>(input_width - 1) / (params.output_width - 1)) - : (static_cast<float>(input_width) / params.output_width); + ? (static_cast<float>(input_width - 1) / (params.output_width - 1)) + : (static_cast<float>(input_width) / params.output_width); ResizeBilinearGenericSmallChannel<uint8_t>( - batches, input_height, input_width, depth, params.output_height, params.output_width, - height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); + batches, input_height, input_width, depth, params.output_height, params.output_width, + height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h index ab2de94cc..644fe0a0e 100644 --- a/compute/cker/include/cker/operation/Select.h +++ b/compute/cker/include/cker/operation/Select.h @@ -34,7 +34,7 @@ void Select(const Shape &input_condition_shape, const D *input_condition_data, const T *input_y_data, const Shape &output_shape, T *output_data) { const int64_t flatsize = - MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); + MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i]; @@ -101,7 +101,7 @@ void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_co const int x_index = SubscriptToIndex(desc_x, b, y, x, c); const int y_index = SubscriptToIndex(desc_y, b, y, x, c); output_data[Offset(extended_output_shape, b, y, x, c)] = - input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; + input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; } } } diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h index a072cff8e..ef97fd5d8 100644 --- a/compute/cker/include/cker/operation/Slice.h +++ b/compute/cker/include/cker/operation/Slice.h @@ -43,16 +43,16 @@ inline void Slice(const SliceParams &op_params, const Shape &input_shape, : start_b + op_params.size[0]; const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) - ? input_shape.Dims(1) - : start_h + op_params.size[size_count - 3]; + ? input_shape.Dims(1) + : start_h + op_params.size[size_count - 3]; const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) - ? input_shape.Dims(2) - : start_w + op_params.size[size_count - 2]; + ? input_shape.Dims(2) + : start_w + op_params.size[size_count - 2]; const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) - ? input_shape.Dims(3) - : start_d + op_params.size[size_count - 1]; + ? 
input_shape.Dims(3) + : start_d + op_params.size[size_count - 1]; for (int in_b = start_b; in_b < stop_b; ++in_b) { diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h index 0e0f364ba..620c1f968 100644 --- a/compute/cker/include/cker/operation/SoftMax.h +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -65,7 +65,7 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const for (int c = 0; c < depth; ++c) { output_data[i * depth + c] = - std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum; + std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum; } } } @@ -163,11 +163,11 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, if (input_diff >= diff_min) { const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); + input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); + FixedPointScaledDiff::FromRaw(input_diff_rescaled); sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); + exp_on_negative_values(scaled_diff_f8)); } } @@ -178,11 +178,11 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, // no later adjustment will be needed. int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; int32_t shifted_sum_minus_one = - static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32_t>(1) << 31)); + static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32_t>(1) << 31)); FixedPoint0 shifted_scale = - one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); + one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); for (int c = 0; c < depth; ++c) { @@ -190,16 +190,16 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, if (input_diff >= diff_min) { const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); + input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); + FixedPointScaledDiff::FromRaw(input_diff_rescaled); FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); output_data[i * depth + c] = static_cast<uint8_t>( - std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); + std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); } else { diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h index feeb358c9..aff36e2f3 100644 --- a/compute/cker/include/cker/operation/SpaceToBatchND.h +++ b/compute/cker/include/cker/operation/SpaceToBatchND.h @@ -79,9 +79,9 @@ inline void SpaceToBatchND(const SpaceToBatchParams ¶ms, const Shape &unexte else { const T *in = - input_data + Offset(input_shape, input_batch, - (out_h * block_shape_height + shift_h) - padding_top, - (out_w * block_shape_width + shift_w) - padding_left, 0); + input_data + Offset(input_shape, input_batch, + 
(out_h * block_shape_height + shift_h) - padding_top, + (out_w * block_shape_width + shift_w) - padding_left, 0); memcpy(out, in, depth * sizeof(T)); } } diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h index d5952ae23..cdd812a08 100644 --- a/compute/cker/include/cker/operation/StatelessRandomUniform.h +++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -95,7 +95,7 @@ inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_da GenerateKey(seed_t, &key, &counter); Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>( - random::PhiloxRandom(counter, key), &output_t); + random::PhiloxRandom(counter, key), &output_t); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h index 1dcdd9b79..42433468a 100644 --- a/compute/cker/include/cker/operation/Tile.h +++ b/compute/cker/include/cker/operation/Tile.h @@ -55,7 +55,7 @@ std::pair<int, int> TileOneDimension(const Shape &in_dimensions, const T *in_dat { int stride_size = 0, tiled_stride_size = 0; std::tie(stride_size, tiled_stride_size) = - TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); + TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); copy_from_data += stride_size; copy_to_data += tiled_stride_size; total_stride_size += stride_size; diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h index 9d8cd340d..62eb432ae 100644 --- a/compute/cker/include/cker/operation/Transpose.h +++ b/compute/cker/include/cker/operation/Transpose.h @@ -555,9 +555,9 @@ void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_inp const int total_size = shrunk_input_shape.FlatSize(); const int non_flatten_size = - Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, + Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, - &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); + &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); assert(non_flatten_params.perm[0] != 0); for (int i = 0; i < total_size; i += non_flatten_size) diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h index 7db3a1179..d41f86047 100644 --- a/compute/cker/include/cker/operation/TransposeConv.h +++ b/compute/cker/include/cker/operation/TransposeConv.h @@ -90,11 +90,11 @@ inline void TransposeConv(const TransposeConvParams ¶ms, const Shape &input_ (out_y < output_height)) { float input_value = - input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y, - filter_x, in_channel)]; + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + float filter_value = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] += - input_value * filter_value; + input_value * filter_value; } } } diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h index 912b01a64..8c1d31b56 100644 --- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h +++ 
b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -130,12 +130,12 @@ inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t i const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -192,9 +192,9 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int16x4_t s1_narrowed = vmovn_s32(s1); const int16x4_t s2_narrowed = vmovn_s32(s2); const int16x8_t s = - vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); - const uint8x8_t clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(s))); + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const uint8x8_t clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -205,12 +205,12 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -387,7 +387,7 @@ inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, auto a2 = vld1q_f32(input2_data + i); auto x = OPERATOR::calculate(a1, a2); // vaddq auto x_clamped = - ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); vst1q_f32(output_data + i, x_clamped); } #endif // USE_NEON @@ -395,7 +395,7 @@ inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, { auto x = OPERATOR::calculate(input1_data[i], input2_data[i]); 
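// (Editorial aside, not part of the patch.) In real-number terms, the uint8 Add path
// above (quant8_sum / AddElementwiseQuant8) computes
//   out_q = clamp(round((s1 * (q1 - z1) + s2 * (q2 - z2)) / s_out) + z_out)
// where the callers conventionally store the negated input zero points in
// input1_offset/input2_offset and the output zero point in output_offset; left_shift
// plus the per-operand multiplier/shift pairs are a fixed-point encoding of
// s1/s_out and s2/s_out. Worked example with hypothetical values:
//   s1 = s2 = 0.5, s_out = 1.0, all zero points 0, q1 = 10, q2 = 6
//   -> real sum 0.5*10 + 0.5*6 = 8.0 -> out_q = 8, finally clamped to
//      [quantized_activation_min, quantized_activation_max].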
output_data[i] = ACTIVATION::applyCeiling( - ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } @@ -441,7 +441,7 @@ inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &par auto a2 = vld1q_f32(input2_data + i); auto x = OPERATOR::calculate(broadcast_value_dup, a2); auto x_clamped = - ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); vst1q_f32(output_data + i, x_clamped); } #endif // USE_NEON @@ -449,13 +449,13 @@ inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &par { auto x = OPERATOR::calculate(broadcast_value, input2_data[i]); output_data[i] = ACTIVATION::applyCeiling( - ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } using BinaryOpImplFloatFuncs = - std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), - void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>; + std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), + void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>; template <class FUNC> inline BinaryOpImplFloatFuncs @@ -514,23 +514,22 @@ inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { + fn = + [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { return static_cast<uint8_t>(quant8_sum(params, a, b)); }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + reference::BroadcastBinaryArithmeticOpSlowQuant8( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } else { BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(AddElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(AddScalarBroadcastQuant8)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, + uint8_t *)>(AddElementwiseQuant8), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, + uint8_t *)>(AddScalarBroadcastQuant8)); } } @@ -542,7 +541,7 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a + b; }; + 
[](const float &a, const float &b) -> float { return a + b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } @@ -550,10 +549,10 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); - BinaryBroadcastFiveFold(params, params.broadcast_category == - BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, - output_data, implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold( + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } } @@ -580,14 +579,14 @@ inline void BroadcastSubDispatch(const BinaryArithmeticOpParam ¶ms, const Sh else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { auto implFuncs = - getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params); + getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params); BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, implFuncs.first, implFuncs.second); } else { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a - b; }; + [](const float &a, const float &b) -> float { return a - b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } @@ -599,11 +598,11 @@ inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t i const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32_t clamped_output = std::min( - params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); return clamped_output; } @@ -652,8 +651,8 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + const auto clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(p))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -663,12 +662,11 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t input1_val = params.input1_offset + input1_data[i]; const int32_t input2_val = params.input2_offset + input2_data[i]; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); - const int32_t clamped_output = - 
std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[i] = static_cast<uint8_t>(clamped_output); } } @@ -711,22 +709,21 @@ inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam ¶ms, if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { + fn = + [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { return static_cast<uint8_t>(quant8_mul(params, a, b)); }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + reference::BroadcastBinaryArithmeticOpSlowQuant8( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(MulElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(MulSimpleBroadcastQuant8)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, + uint8_t *)>(MulElementwiseQuant8), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, + uint8_t *)>(MulSimpleBroadcastQuant8)); } inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -738,16 +735,16 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { // TODO: Use GetBinaryArithmeticFn const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a * b; }; + [](const float &a, const float &b) -> float { return a * b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); - BinaryBroadcastFiveFold(params, params.broadcast_category == - BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, - output_data, implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold( + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -760,7 +757,7 @@ inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); 
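// (Editorial aside, not part of the patch.) On aarch64 the call above reuses the NEON
// elementwise float path, so Div gets the same activation clamping as Add and Mul; on
// other targets the #else branch below falls back to the reference lambda a / b.
// A caller that wants an unclamped division would typically pass
//   params.float_activation_min = std::numeric_limits<float>::lowest();
//   params.float_activation_max = std::numeric_limits<float>::max();
// so that applyFloor/applyCeiling become no-ops.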
#else const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a / b; }; + [](const float &a, const float &b) -> float { return a / b; }; reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); #endif // __aarch64__ @@ -781,7 +778,7 @@ inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Sh else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { auto implFuncs = - getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params); + getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params); BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, implFuncs.first, implFuncs.second); } @@ -789,7 +786,7 @@ inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Sh #endif // __aarch64__ { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a / b; }; + [](const float &a, const float &b) -> float { return a / b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h index 0f620146c..26fc443db 100644 --- a/compute/cker/include/cker/operation/optimized/Conv.h +++ b/compute/cker/include/cker/operation/optimized/Conv.h @@ -48,7 +48,7 @@ struct GemmlowpOutputPipeline typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; + Pipeline; static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset, int32_t output_multiplier, int output_left_shift, int32_t output_activation_min, int32_t output_activation_max) @@ -106,7 +106,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 const int filter_height = filter_shape.Dims(1); const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = - stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; + stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; if (need_dilated_im2col) { assert(im2col_data); @@ -141,7 +141,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 // the other calls commented out. This is a partial rollback of cl/196819423. // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3); const int gemm_input_cols = - gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); + gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); const int filter_rows = filter_shape.Dims(0); // See b/79927784. 
// const int filter_cols = FlatSizeSkipDim(filter_shape, 0); @@ -156,17 +156,17 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 assert(bias_shape.FlatSize() == output_rows); UNUSED_RELEASE(bias_shape); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, filter_rows, filter_cols); + filter_data, filter_rows, filter_cols); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix( - gemm_input_data, gemm_input_rows, gemm_input_cols); + gemm_input_data, gemm_input_rows, gemm_input_cols); gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows, output_cols); const auto &output_pipeline = - GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max); + GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max); gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, - output_pipeline); + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, + output_pipeline); } } // namespace optimized @@ -202,10 +202,10 @@ public: T *output_data, int output_height, int output_width) { const bool is_1x1_kernel = - (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); + (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); const bool is_same_height_width = - (filter_height == input_height && filter_width == input_width && pad_width == 0 && - pad_height == 0); + (filter_height == input_height && filter_width == input_width && pad_width == 0 && + pad_height == 0); if (is_1x1_kernel || is_same_height_width) { // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication. diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h new file mode 100644 index 000000000..d4397933a --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
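A shape aside for the quantized Conv path above (sizes are hypothetical): with a 3x3 filter, 16 input channels, 32 output channels and a 1x56x56 output, im2col yields a 144 x 3136 input matrix (3*3*16 rows, 1*56*56 columns), the filter matrix is 32 x 144, and a single gemmlowp GEMM produces the 32 x 3136 result that the output pipeline (bias add, requantization, clamp, cast) turns into uint8. The same observation is behind the is_1x1_kernel check above: with a 1x1 kernel, unit stride and no padding the input already has that matrix layout, so the im2col step can be skipped entirely.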
+ */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +// Implementation of float DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct FloatDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON + +template <> struct FloatDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); + acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); + acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); + acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + + const float32x2_t filters = vld1_f32(filter_ptr); + const float32x4_t filters_dup2 = vcombine_f32(filters, filters); + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. 
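// (Editorial aside, not part of the patch.) The template parameters encode what each
// specialization may assume: kAllowStrided says whether consecutive output pixels are
// input_ptr_increment floats apart (strided cases) or simply contiguous,
// kFixedInputDepth pins the input depth at compile time (0 means "any depth", taken
// from the runtime argument), and kFixedDepthMultiplier is the per-channel output
// fan-out. A hypothetical invocation of the first specialization above, accumulating
// 16 output pixels of an 8-channel row into a bias-initialized accumulator buffer,
// would look like:
//   FloatDepthwiseConvKernel<false, 8, 1>::Run(/*num_output_pixels=*/16,
//                                              /*input_depth=*/8,
//                                              /*depth_multiplier=*/1, input_row,
//                                              /*input_ptr_increment=*/0,
//                                              filter_row, acc_buffer);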
+ for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + const float32x4_t input = vld1q_f32(input_ptr); + input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filters_dup2); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + const float32x2_t input = vld1_f32(input_ptr); + input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmla_f32(acc, input, filters); + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters + float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3); + local_filter_ptr += 16; + // Load the inputs + float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0); + float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1); + float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2); + float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3); + local_input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + // Multiply-accumulate + acc_0 = vmlaq_f32(acc_0, input_0, filter_0); + acc_1 = vmlaq_f32(acc_1, input_1, filter_1); + acc_2 = vmlaq_f32(acc_2, input_2, filter_2); + acc_3 = vmlaq_f32(acc_3, input_3, filter_3); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. 
+ for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x4_t filter; + filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + float32x4_t input; + input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc; + acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const float input_val = *local_input_ptr++; + const float filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); + acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + input_ptr += input_ptr_increment; + } + } +}; + +// Note this implementation is very slow for input_depths < 8 +// (e.g. comparable to reference implementation) see, specializations for +// input_depth=3 below. +template <> struct FloatDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. 
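// (Editorial aside, not part of the patch.) With depth_multiplier == 2 the two outputs
// of an input channel sit next to each other, so the 8-channel block below duplicates
// every input lane with vzipq_f32(input, input), e.g. {a, b, c, d} becomes
// {a, a, b, b} and {c, c, d, d}, which lines the inputs up against the interleaved
// filter taps before the multiply-accumulate.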
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4x2_t input_dup2[2]; + for (int i = 0; i < 2; i++) + { + const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); + input_dup2[i] = vzipq_f32(input, input); + } + local_input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); + acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); + acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); + acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x2_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1_f32(local_filter_ptr + 2 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float32x4_t input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x2_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); + acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + const float32x4_t filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); + acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. 
+ for (; ic < input_depth; ic++) + { + // Load the inputs + const float input_val = *local_input_ptr++; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; + } + local_filter_ptr += 2; + acc_buffer_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x2_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1_f32(filter_ptr + 2 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x2_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate for each input channel there 2 outputs + acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 6; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 4> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // NOTE: we only want 3 values, so we read it as two ops where + // the second op just duplicates the lane + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x4_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate all outputs. + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 12; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5); + float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6); + float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5); + float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6); + float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val); + acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val); + acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5); + vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6); + vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7); + acc_buffer_ptr += 32; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 20> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + + // Handle one output pixel at a 
time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 16> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + for (int ic = 0; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x2_t filter = vld1_f32(filter_ptr); + float32x4_t filter_x4 = vcombine_f32(filter, filter); + int outp = 0; + + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x2_t input_1 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x2_t input_2 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x4_t input = vcombine_f32(input_1, input_2); + + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter_x4); + + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x2_t input = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmla_f32(acc, input, filter); + + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x4_t filter = vld1q_f32(filter_ptr); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input = vld1q_f32(input_ptr); + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + input_ptr += input_ptr_increment; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width, + const float *input_data, int pad_width, int depth_multiplier, + int filter_width, const float *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, float *acc_buffer) +{ + // Sanity check parameters. 
This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const float *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const float *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment, + filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. 
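Both the templated routine above and the generic fallback that follows clamp the output-x loop to the positions whose corresponding input column lies inside the unpadded input row. A small worked example of that bound computation, using illustrative values that are not taken from this commit:

// Illustrative values: stride = 2, dilation_factor = 1, pad_width = 1,
// input_width = 5, filter_x = 0 (so in_x = out_x * 2 - 1).
//   out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2 = (1 - 0 + 1) / 2 = 1
//     (out_x = 0 would read in_x = -1, i.e. left padding)
//   out_x_loop_end_unclamped   = (pad_width + input_width - dilation_factor * filter_x + 1) / 2 = (1 + 5 - 0 + 1) / 2 = 3
//     (out_x = 3 would read in_x = 5, i.e. past the end of the row)
// The kernel therefore runs only for out_x in [1, 3), reading in_x = 1 and 3;
// out_x = 0 gets no contribution from this filter tap, which is exactly the
// zero-padding behaviour.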
+inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+                                              int input_width, const float *input_data,
+                                              int pad_width, int depth_multiplier, int filter_width,
+                                              const float *filter_data, int out_x_buffer_start,
+                                              int out_x_buffer_end, int output_depth,
+                                              float *acc_buffer)
+{
+  const float *filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+  {
+    const int out_x_loop_start =
+      std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+    const int out_x_loop_end =
+      std::min(out_x_buffer_end,
+               (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+    float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const float *input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+    {
+      const float *filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic)
+      {
+        const float input_val = *input_ptr++;
+        for (int m = 0; m < depth_multiplier; m++)
+        {
+          const float filter_val = *filter_ptr++;
+          *acc_buffer_ptr++ += filter_val * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const float *bias_data, float *acc_buffer)
+{
+  // TODO(benoitjacob): This might need optimized specializations
+  // for small output_depth values, if that ever becomes an important
+  // case (like it was for some quantized DepthwiseConv cases).
+  for (int i = 0; i < num_output_pixels; i++)
+  {
+    memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+// DepthwiseConv can run with multiple threads on the dimension specified by thread_dim.
+// Each thread processes output elements along that dimension in the half-open
+// range [thread_start, thread_end).
+// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, the
+// thread calculates DepthwiseConv for output rows 2 through 5, i.e.
+// output_data[:, 2:6, :, :].
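A minimal sketch of how a caller might split output rows into such [thread_start, thread_end) ranges for thread_dim = 1; the helper name and the even-split policy are illustrative assumptions, not part of this commit:

#include <algorithm>
#include <utility>
#include <vector>

// Hypothetical helper: divide output_height rows into half-open ranges,
// one per worker, each later passed to DepthwiseConvImpl as
// (thread_start, thread_end) with thread_dim = 1.
inline std::vector<std::pair<int, int>> SplitOutputRows(int output_height, int num_workers)
{
  std::vector<std::pair<int, int>> ranges;
  const int rows_per_worker = (output_height + num_workers - 1) / num_workers; // ceiling division
  for (int start = 0; start < output_height; start += rows_per_worker)
  {
    ranges.emplace_back(start, std::min(output_height, start + rows_per_worker));
  }
  return ranges;
}

With output_height = 7 and num_workers = 2 this yields [0, 4) and [4, 7), so the two calls together cover every output row exactly once.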
+inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &filter_shape, + const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data, + int thread_start, int thread_end, int thread_dim) +{ + UNUSED_RELEASE(bias_shape); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(thread_dim == 0 || thread_dim == 1); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + static const int kAccBufferMaxSize = 4832; + float acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + + UNUSED_RELEASE(kAccBufferActualSize); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric); + row_accum_func_t row_accum_func = nullptr; + +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \ + if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ + (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ + { \ + row_accum_func = \ + FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + // We go over our list of kernels by decreasing order of preference + // for the cases where multiple kernels could apply. + + // Start with the fastest kernels: AllowStrided=false, fixed input depth. + + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + + // Next come the strided kernels: AllowStrided=true, fixed input depth. + // They are a bit less efficient, but allow stride!=1. 
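  // For example, a layer with stride_width = 2, input_depth = 3 and depth_multiplier = 2
  // fails the two non-strided entries above (stride != 1) and is matched by the
  // (true, 3, 2) entry below, so row_accum_func ends up pointing at
  // FloatDepthwiseConvAccumRow<true, 3, 2>.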
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) + +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = FloatDepthwiseConvAccumRowGeneric; + } + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + float *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. 
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating. Now store to destination. + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +// TODO(benoitjacob) optimized code goes here +#ifdef USE_NEON + // Handle 16 values at a time + for (; i <= num_output_values - 16; i += 16) + { + float32x4_t acc[4]; + for (int k = 0; k < 4; k++) + { + acc[k] = vld1q_f32(acc_buffer + i + 4 * k); + } + for (int k = 0; k < 4; k++) + { + acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } + for (int k = 0; k < 4; k++) + { + vst1q_f32(output_ptr + 4 * k, acc[k]); + } + output_ptr += 16; + } + // Handle 4 values at a time + for (; i <= num_output_values - 4; i += 4) + { + float32x4_t acc = vld1q_f32(acc_buffer + i); + + acc = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc)); + + vst1q_f32(output_ptr, acc); + output_ptr += 4; + } +#endif + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) + { + float acc = acc_buffer[i]; + acc = std::max(output_activation_min, std::min(output_activation_max, acc)); + + *output_ptr++ = acc; + } + } + } + output_ptr += batch_step; + } +} + +} // nnfw +} // cker +} // optimized + +#endif diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h index d383b126d..5ca56fd09 100644 --- a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h @@ -32,6 +32,8 @@ namespace cker { namespace optimized { +namespace depthwise_conv +{ // Implementation of quantized DepthwiseConv @@ -44,8 +46,8 @@ struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -57,7 +59,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { filter[i] = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); } // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) @@ -80,9 +82,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { acc[0].val[i] = - vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); acc[1].val[i] = - vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -98,8 +100,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -174,8 +176,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -206,9 +208,9 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> for (int i = 0; i < 2; i++) { acc[2 * i + 0] = - vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -253,8 +255,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -338,8 +340,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -409,8 +411,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> 
template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -534,8 +536,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -600,8 +602,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -703,8 +705,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -778,8 +780,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -864,8 +866,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -873,7 +875,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // We will do that by register-level table-look-up using VTBL instructions. 
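    // Read together, the three 8-byte index vectors below are
    // 0,0,0,1,1,1,2,2 | 2,3,3,3,4,4,4,5 | 5,5,6,6,6,7,7,7, i.e. each of the
    // 8 input channels repeated three times, so one table lookup per index
    // vector lines the duplicated inputs up with the three filter values per
    // channel (depth_multiplier = 3).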
// Here we prepare the registers containing the table-lookup indices. static const uint8_t dup3_indices_array[3][8] = { - {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; uint8x8_t dup3_indices[3]; for (int i = 0; i < 3; i++) { @@ -928,9 +930,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> for (int j = 0; j < 3; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -944,10 +946,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 3; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 3; @@ -960,8 +962,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1002,9 +1004,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (int j = 0; j < 2; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); } // Store the accumulators back to acc_buffer. for (int i = 0; i < 2; i++) @@ -1018,10 +1020,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (; ic < input_depth; ic++) { // Load the inputs. 
- const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 2; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 2; @@ -1034,8 +1036,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1112,8 +1114,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; - const uint16_t filter_val = *local_filter_ptr++ + filter_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++ + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } input_ptr += input_ptr_increment; @@ -1124,8 +1126,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1174,7 +1176,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -1189,8 +1191,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1228,8 +1230,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, 
int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1253,7 +1255,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[4]; for (int i = 0; i < 4; i++) @@ -1279,8 +1281,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1302,7 +1304,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1338,8 +1340,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1363,7 +1365,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1390,21 +1392,21 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; // Load the filters, add filter_offset. const uint8x8_t filter_u8 = vld1_u8(filter_ptr); const int16x8_t filter = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[2]; for (int i = 0; i < 2; i++) @@ -1427,8 +1429,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1455,7 +1457,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1); input_ptr += input_ptr_increment; const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); // Multiply-accumulate. @@ -1490,8 +1492,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1555,8 +1557,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1652,9 +1654,9 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d else { out_x_loop_start_unclampled = - (pad_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width - dilation_factor * filter_x + stride - 1) / stride; out_x_loop_end_unclampled = - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; } } else @@ -1672,8 +1674,8 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d const uint8_t *input_ptr = input_data + in_x_origin * input_depth; const int num_output_pixels = out_x_loop_end - out_x_loop_start; QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( - num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, - input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); + num_output_pixels, input_depth, depth_multiplier, 
input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); filter_base_ptr += output_depth; } } @@ -1690,11 +1692,11 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_facto const uint8_t *filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; @@ -1813,7 +1815,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, - uint8_t *output_data) + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) { (void)bias_shape; const int stride_width = params.stride_width; @@ -1852,6 +1855,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); assert(kAccBufferActualSize <= kAccBufferMaxSize); assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + UNUSED_RELEASE(kAccBufferActualSize); // row_accum_func will point to the core accumulation function to be used @@ -1865,7 +1870,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ { \ row_accum_func = \ - QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ } #ifdef USE_NEON @@ -1919,22 +1924,49 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); // Now that we have determined row_accum_func, we can start work. 
- uint8_t *output_ptr = output_data; - for (int b = 0; b < batches; ++b) + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + uint8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) { - for (int out_y = 0; out_y < output_height; ++out_y) + for (int out_y = row_start; out_y < row_end; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; const int filter_y_start = - std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); const int filter_y_end = - std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / - dilation_height_factor); + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = - std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); // We call a 'pixel' a group of activation that share all but the // 'depth'/'channel' coordinate. num_output_pixels is the number of // output pixels that we will accumulate in this loop iteration. @@ -1952,7 +1984,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape filter_data + filter_y * filter_height_stride, filter_offset, out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); } - // Finished accumulating int32 values. Now need to convert them to + // Finished accumulating int32_t values. Now need to convert them to // the final 8bit form and store them. 
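The accumulators being converted here hold sums of (input + input_offset) * (filter + filter_offset) products, which is presumably why this commit makes the offsets signed int16_t: they are the negated zero points and are usually negative (for example -128), a value an unsigned uint16_t parameter cannot carry into the scalar paths without wrapping. The conversion to uint8 itself is not shown in this hunk; purely for orientation, a scalar sketch of the usual rescale, offset and clamp step follows, where output_multiplier, output_shift, output_offset and the activation bounds are the conventional DepthwiseConvParams fields assumed here rather than taken from this diff:

#include <algorithm>
#include <cstdint>

// Scalar sketch only; the real code in this file uses NEON and the
// saturating rounding-doubling-high-multiply helpers.
inline uint8_t RequantizeOne(int32_t acc, int32_t output_multiplier, int output_shift,
                             int32_t output_offset, int32_t activation_min, int32_t activation_max)
{
  // Rescale by output_multiplier * 2^(output_shift - 31), rounding to nearest.
  const int total_shift = 31 - output_shift; // assumed > 0 for this sketch
  int64_t scaled = static_cast<int64_t>(acc) * output_multiplier;
  scaled += int64_t{1} << (total_shift - 1);
  int32_t result = static_cast<int32_t>(scaled >> total_shift);
  result += output_offset; // re-centre on the output zero point
  result = std::max(activation_min, std::min(activation_max, result));
  return static_cast<uint8_t>(result);
}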
const int num_output_values = output_depth * num_output_pixels; int i = 0; @@ -2113,9 +2145,111 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape } } } + output_ptr += batch_step; } } +} // namespace depthwise_conv + +// template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(output_activation_min <= output_activation_max); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(output_activation_min); + UNUSED_RELEASE(output_activation_max); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +// TODO Use below codes +// // Dispatch to dot-product 3x3 kernels when supported. +// +// ruy::Context *ruy_context = cpu_backend_context->ruy_context(); +// const bool has_dot_product_instructions = +// ruy_context != nullptr && +// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone; +// if (has_dot_product_instructions) +// { +// using optimized_ops::depthwise_conv::DotProduct3x3KernelType; +// DotProduct3x3KernelType kernel_type = +// optimized_ops::depthwise_conv::CategorizeDotProductKernel( +// input_shape, filter_shape, params); +// if (kernel_type != DotProduct3x3KernelType::kNone) +// { +// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3< +// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data, +// filter_shape, filter_data, +// bias_shape, +// bias_data, output_shape, +// output_data); +// return; +// } +// } +// +// // Dispatch to non-dot-product 3x3 kernels when supported. +// +// const int stride_width = params.stride_width; +// const int stride_height = params.stride_height; +// const int pad_width = params.padding_values.width; +// const int pad_height = params.padding_values.height; +// const int output_shift = params.output_shift; +// +// // Call kernel optimized for depthwise convolutions using 3x3 filters if +// // parameters are supported. 
+// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width, +// stride_height, dilation_width_factor, +// dilation_height_factor, pad_width, pad_height, +// depth_multiplier, output_shape, output_shift)) +// { +// depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>( +// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, +// output_shape, output_data, thread_start, thread_end, thread_dim); +// return; +// } +#endif + + depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, + thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, + thread_end, thread_dim); +} + } // namespace optimized } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h index ae1f9e78e..f5edc94ab 100644 --- a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h +++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h @@ -111,7 +111,7 @@ inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, { const int bottom_row_elements = (bottom_padding * kwidth * in_depth); const int bottom_start = - output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); } } @@ -159,7 +159,7 @@ void DilatedIm2col(const ConvParams ¶ms, const Shape &input_shape, const T * for (int batch = 0; batch < batches; ++batch) { const T zero_byte = - zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]); + zero_bytes_len > 1 ? 
static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]); for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h index e8ffd4014..1b3020de2 100644 --- a/compute/cker/include/cker/operation/reference/BatchMatMul.h +++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h @@ -87,9 +87,8 @@ inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Sha { const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; - float *out_ptr = - output_data + - ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; + float *out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * + lhs_rows * rhs_cols; for (int j = 0; j < rhs_cols; ++j) { for (int i = 0; i < lhs_rows; ++i) diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h index f7e39248c..93cb21e0b 100644 --- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -56,17 +56,16 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < size; i++) { - output_data[i] = - ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), - params.float_activation_min, params.float_activation_max); + output_data[i] = ActivationFunctionWithMinMax( + fn(input1_data[i], input2_data[i]), params.float_activation_min, params.float_activation_max); } } template <typename T> inline void BroadcastBinaryArithmeticOpSlowQuant8( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, - const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, - const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, + const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -100,10 +99,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax<uint8_t>( - fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + ActivationFunctionWithMinMax<uint8_t>( + fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -143,9 +142,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + 
fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -154,9 +153,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m template <> inline void BroadcastBinaryArithmeticOpSlow( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, - const Shape &input2_shape, const float *input2_data, const Shape &output_shape, - float *output_data, const std::function<float(const float &, const float &)> &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, + const Shape &input2_shape, const float *input2_data, const Shape &output_shape, + float *output_data, const std::function<float(const float &, const float &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -171,10 +170,10 @@ inline void BroadcastBinaryArithmeticOpSlow( { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.float_activation_min, params.float_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax(fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.float_activation_min, params.float_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h index 86e8b5143..43a5bf256 100644 --- a/compute/cker/include/cker/operation/reference/Conv.h +++ b/compute/cker/include/cker/operation/reference/Conv.h @@ -98,8 +98,8 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const float bias_value = bias_data[out_channel]; } output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - ActivationFunctionWithMinMax(total + bias_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(total + bias_value, output_activation_min, + output_activation_max); } } } @@ -183,7 +183,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - static_cast<uint8_t>(acc); + static_cast<uint8_t>(acc); } } } diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 7b4ff2040..62eeaf6bd 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -52,7 +52,7 @@ void MakeRuyMatrix(const MatrixParams<Scalar> ¶ms, DataPointer data_ptr, ruy::Matrix<Scalar> *dst, bool use_caching = false) { ruy::Order ruy_order = - params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor; + params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor; ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. // It does care whether we assign to it a Scalar* or a const Scalar*. 
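Both RuySupport.h variants touched by this patch — the cker copy reflowed just above and the compute/ruy copy added below — expose the same MakeRuyMatrix/MakeRuyMulParams helpers around ::ruy::Mul. A minimal float GEMM sketch of that call pattern follows; the wrapper name GemmFloat and all dimensions are illustrative assumptions, not part of the patch.

// Hypothetical sketch, not part of the patch: float GEMM through the
// nnfw::ruy ruy_support helpers. Matrix contents and sizes are assumed.
#include "ruy/Types.h"
#include "ruy/RuySupport.h"
#include <ruy/ruy.h>
#include <ruy/context.h>

void GemmFloat(const float *lhs, const float *rhs, const float *bias, float *dst, int m, int n,
               int k, ::ruy::Context *ruy_context)
{
  using namespace nnfw::ruy;

  // Describe lhs (m x k, row-major), rhs (k x n, col-major) and dst (m x n, col-major).
  MatrixParams<float> lhs_params;
  lhs_params.order = Order::kRowMajor;
  lhs_params.rows = m;
  lhs_params.cols = k;
  MatrixParams<float> rhs_params;
  rhs_params.order = Order::kColMajor;
  rhs_params.rows = k;
  rhs_params.cols = n;
  MatrixParams<float> dst_params;
  dst_params.order = Order::kColMajor;
  dst_params.rows = m;
  dst_params.cols = n;

  // Bias and clamp bounds travel in GemmParams; the defaults leave the output unclamped.
  GemmParams<float, float> gemm_params;
  gemm_params.bias = bias; // may be nullptr

  ::ruy::Matrix<float> ruy_lhs;
  ::ruy::Matrix<float> ruy_rhs;
  ::ruy::Matrix<float> ruy_dst;
  ruy_support::MakeRuyMatrix(lhs_params, lhs, &ruy_lhs);
  ruy_support::MakeRuyMatrix(rhs_params, rhs, &ruy_rhs);
  ruy_support::MakeRuyMatrix(dst_params, dst, &ruy_dst);

  ::ruy::BasicSpec<float, float> ruy_mul_params;
  ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);

  ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
}

The kernels added in this patch pass use_caching = true for the constant weight and input operands so ruy can reuse packed data; the sketch keeps the default, which never caches.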
diff --git a/compute/ruy/CMakeLists.txt b/compute/ruy/CMakeLists.txt new file mode 100644 index 000000000..d98ee1cd6 --- /dev/null +++ b/compute/ruy/CMakeLists.txt @@ -0,0 +1,11 @@ +nnfw_find_package(Ruy REQUIRED) + +add_library(nnfw_lib_ruy INTERFACE) +target_link_libraries(nnfw_lib_ruy INTERFACE ruy) +target_link_libraries(nnfw_lib_ruy INTERFACE ruy_instrumentation) +target_compile_definitions(nnfw_lib_ruy INTERFACE USE_RUY_GEMV) +if(PROFILE_RUY) + target_link_libraries(nnfw_lib_ruy INTERFACE ruy_profiler) +endif(PROFILE_RUY) + +target_include_directories(nnfw_lib_ruy INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) diff --git a/compute/ruy/include/ruy/NeonTensorUtils.h b/compute/ruy/include/ruy/NeonTensorUtils.h new file mode 100644 index 000000000..fb8b0a363 --- /dev/null +++ b/compute/ruy/include/ruy/NeonTensorUtils.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_NEON_TENSOR_UTILS_H__ +#define __NNFW_RUY_NEON_TENSOR_UTILS_H__ + +#include "ruy/neon/neon_check.h" + +#ifdef USE_NEON + +#define kFloatWeightsPerNeonLane 4 + +namespace nnfw +{ +namespace ruy +{ + +inline bool NeonIsZeroVector(const float *vector, int v_size) +{ + // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot + // use the main vectorized loop, and we need to process sequentially. + // postamble_start shows the start index where this should happen. + const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + const float32x4_t zero_x4_float = vmovq_n_f32(0.0f); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) + { + const float32x4_t i_x4_float = vld1q_f32(vector + v); + uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float); + if (vgetq_lane_u32(cmp_result, 0) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 1) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 2) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 3) == 0) + return false; + } + + // Postamble loop + for (int v = postamble_start; v < v_size; ++v) + { + if (vector[v] != 0.0) + return false; + } + return true; +} + +} // namespace ruy +} // namespace nnfw + +#endif // USE_NEON + +#endif // __NNFW_RUY_NEON_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/PortableTensorUtils.h b/compute/ruy/include/ruy/PortableTensorUtils.h new file mode 100644 index 000000000..2d2c36cb2 --- /dev/null +++ b/compute/ruy/include/ruy/PortableTensorUtils.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ +#define __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ + +namespace nnfw +{ +namespace ruy +{ + +inline bool PortableIsZeroVector(const float *vector, int v_size) +{ + for (int i = 0; i < v_size; ++i) + { + if (*vector++ != 0.0f) + return false; + } + return true; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_PORTABLE_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/RuySupport.h b/compute/ruy/include/ruy/RuySupport.h new file mode 100644 index 000000000..7086a96c4 --- /dev/null +++ b/compute/ruy/include/ruy/RuySupport.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_RUY_SUPPORT_H__ +#define __NNFW_RUY_RUY_SUPPORT_H__ + +#include <util/ConfigSource.h> +#include <ruy/matrix.h> +#include <ruy/ruy.h> +#include <cassert> +#include "Types.h" + +namespace nnfw +{ +namespace ruy +{ +namespace ruy_support +{ + +inline ::ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy) +{ + switch (cache_policy) + { + case CachePolicy::kNeverCache: + return ::ruy::CachePolicy::kNeverCache; + case CachePolicy::kCacheIfLargeSpeedup: + return ::ruy::CachePolicy::kCacheIfLargeSpeedup; + case CachePolicy::kAlwaysCache: + return ::ruy::CachePolicy::kAlwaysCache; + default: + assert(false); + return ::ruy::CachePolicy::kNeverCache; + } +} + +template <typename Scalar, typename DataPointer> +void MakeRuyMatrix(const MatrixParams<Scalar> ¶ms, DataPointer data_ptr, + ::ruy::Matrix<Scalar> *dst, bool use_caching = false) +{ + ::ruy::Order ruy_order = + params.order == Order::kColMajor ? ::ruy::Order::kColMajor : ::ruy::Order::kRowMajor; + ::ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); + // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. + // It does care whether we assign to it a Scalar* or a const Scalar*. + dst->set_data(data_ptr); + dst->set_zero_point(params.zero_point); + if (use_caching) + { + dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy)); + } +} + +template <typename GemmParamsType, typename RuySpecType> +void MakeRuyMulParams(const GemmParamsType ¶ms, RuySpecType *ruy_mul_params) +{ + // This validation has already been performed by the Gemm API entry point, + // but it doesn't hurt to test specifically this again here, where it's + // being used. 
+ ValidateGemmParams(params); + + ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint); + ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent); + ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel); + ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel); + ruy_mul_params->set_bias(params.bias); + ruy_mul_params->set_clamp_min(params.clamp_min); + ruy_mul_params->set_clamp_max(params.clamp_max); +} + +} // namespace ruy_support +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_RUY_SUPPORT_H__ diff --git a/compute/ruy/include/ruy/Shape.h b/compute/ruy/include/ruy/Shape.h new file mode 100644 index 000000000..981c5b4de --- /dev/null +++ b/compute/ruy/include/ruy/Shape.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_SHAPE_H__ +#define __NNFW_RUY_SHAPE_H__ + +#include <algorithm> +#include <cstring> +#include <cassert> +#include <vector> + +#define UNUSED_RELEASE(a) (void)(a) + +namespace nnfw +{ +namespace ruy +{ + +class Shape +{ +public: + // Shapes with dimensions up to 5 are stored directly in the structure, while + // larger shapes are separately allocated. + static constexpr int kMaxSmallSize = 5; + + Shape &operator=(Shape const &) = delete; + + Shape() : _size(0) {} + + explicit Shape(int dimensions_count) : _size(dimensions_count) + { + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + Shape(int shape_size, int32_t value) : _size(0) + { + Resize(shape_size); + for (int i = 0; i < shape_size; ++i) + { + SetDim(i, value); + } + } + + Shape(int dimensions_count, const int32_t *dims_data) : _size(0) + { + ReplaceWith(dimensions_count, dims_data); + } + + Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); } + + // Avoid using this constructor. We should be able to delete it when C++17 + // rolls out. + Shape(Shape const &other) : _size(other.DimensionsCount()) + { + if (_size > kMaxSmallSize) + { + _dims_pointer = new int32_t[_size]; + } + std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size); + } + + bool operator==(const Shape &comp) const + { + return this->_size == comp._size && + std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0; + } + + ~Shape() + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + } + + inline int32_t DimensionsCount() const { return _size; } + inline int32_t Dims(int i) const + { + assert(i >= 0); + assert(i < _size); + return _size > kMaxSmallSize ? 
_dims_pointer[i] : _dims[i]; + } + inline void SetDim(int i, int32_t val) + { + assert(i >= 0); + assert(i < _size); + if (_size > kMaxSmallSize) + { + _dims_pointer[i] = val; + } + else + { + _dims[i] = val; + } + } + + inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + // The caller must ensure that the shape is no bigger than 4-D. + inline const int32_t *DimsDataUpTo4D() const { return _dims; } + + inline void Resize(int dimensions_count) + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + _size = dimensions_count; + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + inline void ReplaceWith(int dimensions_count, const int32_t *dims_data) + { + Resize(dimensions_count); + int32_t *dst_dims = DimsData(); + std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); + } + + inline void ReplaceWith(const Shape &other) + { + ReplaceWith(other.DimensionsCount(), other.DimsData()); + } + + inline void ReplaceWith(Shape &&other) + { + Resize(0); + std::swap(_size, other._size); + if (_size <= kMaxSmallSize) + std::copy(other._dims, other._dims + kMaxSmallSize, _dims); + else + _dims_pointer = other._dims_pointer; + } + + template <typename T> inline void BuildFrom(const T &src_iterable) + { + const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); + Resize(dimensions_count); + int32_t *data = DimsData(); + for (auto it : src_iterable) + { + *data = it; + ++data; + } + } + + // This will probably be factored out. Old code made substantial use of 4-D + // shapes, and so this function is used to extend smaller shapes. Note that + // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be + // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their + // inputs should already be 4-D, so this function should not be needed. + inline static Shape ExtendedShape(int new_shape_size, const Shape &shape) + { + return Shape(new_shape_size, shape, 1); + } + + inline void BuildFrom(const std::initializer_list<int> init_list) + { + BuildFrom<const std::initializer_list<int>>(init_list); + } + + // Returns the total count of elements, that is the size when flattened into a + // vector. + inline int FlatSize() const + { + int buffer_size = 1; + const int *dims_data = DimsData(); + for (int i = 0; i < _size; i++) + { + const int dim = dims_data[i]; + assert(dim >= 1); + buffer_size *= dim; + } + return buffer_size; + } + + bool operator!=(const Shape &comp) const { return !((*this) == comp); } + +private: + // For use only by ExtendedShape(), written to guarantee (return-value) copy + // elision in C++17. + // This creates a shape padded to the desired size with the specified value. 
+ Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0) + { + assert(new_shape_size >= shape.DimensionsCount()); + assert(new_shape_size <= kMaxSmallSize); + Resize(new_shape_size); + const int size_increase = new_shape_size - shape.DimensionsCount(); + for (int i = 0; i < size_increase; ++i) + { + SetDim(i, pad_value); + } + std::memcpy(DimsData() + size_increase, shape.DimsData(), + sizeof(int32_t) * shape.DimensionsCount()); + } + + int32_t _size; + union { + int32_t _dims[kMaxSmallSize]; + int32_t *_dims_pointer{nullptr}; + }; +}; + +inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2) +{ + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + assert(shape1.Dims(index1) == shape2.Dims(index2)); + return shape1.Dims(index1); +} + +template <typename... Args> +int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2, Args... args) +{ + assert(shape1.Dims(index1) == shape2.Dims(index2)); + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + return MatchingDim(shape1, index1, args...); +} + +inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); } + +inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3) +{ + assert(shape.DimensionsCount() == 4); + const int *dims_data = shape.DimsDataUpTo4D(); + assert(i0 >= 0 && i0 < dims_data[0]); + assert(i1 >= 0 && i1 < dims_data[1]); + assert(i2 >= 0 && i2 < dims_data[2]); + assert(i3 >= 0 && i3 < dims_data[3]); + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +inline int Offset(const Shape &shape, int *index) +{ + return Offset(shape, index[0], index[1], index[2], index[3]); +} + +inline int FlatSizeSkipDim(const Shape &shape, int skip_dim) +{ + const int dims_count = shape.DimensionsCount(); + assert(skip_dim >= 0 && skip_dim < dims_count); + const auto *dims_data = shape.DimsData(); + int flat_size = 1; + for (int i = 0; i < dims_count; ++i) + { + flat_size *= (i == skip_dim) ? 1 : dims_data[i]; + } + return flat_size; +} + +// Flat size calculation, checking that dimensions match with one or more other +// arrays. +template <typename... Ts> inline bool checkMatching(const Shape &shape, Ts... check_shapes) +{ + const Shape check_shapes_array[sizeof...(Ts)] = {std::forward<Ts>(check_shapes)...}; + for (const auto &check_shape : check_shapes_array) + { + // Check matching of shapes except the case of that two shapes can be scalar + if (shape.DimensionsCount() > 1 || check_shape.DimensionsCount() > 1 || shape.FlatSize() != 1 || + check_shape.FlatSize() != 1) + { + if (shape.DimensionsCount() != check_shape.DimensionsCount()) + { + return false; + } + for (int i = 0; i < shape.DimensionsCount(); ++i) + { + if (shape.Dims(i) != check_shape.Dims(i)) + { + return false; + } + } + } + } + return true; +} + +struct UNUSED_ALL +{ + template <typename... Args> UNUSED_ALL(Args const &...) {} +}; +template <typename... Ts> inline int MatchingFlatSize(const Shape &shape, Ts... 
check_shapes) +{ + UNUSED_ALL{check_shapes...}; + assert(checkMatching(shape, std::forward<Ts>(check_shapes)...)); + return shape.FlatSize(); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return FlatSizeSkipDim(shape, skip_dim); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1); +} + +inline int MatchingElementsSize(const Shape &shape, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + const int size_1 = shape.FlatSize(); + const int size_2 = check_shape_0.FlatSize(); + const int size_3 = check_shape_1.FlatSize(); + assert(size_1 == size_2); + assert(size_2 == size_3); + UNUSED_RELEASE(size_2); + UNUSED_RELEASE(size_3); + return size_1; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_SHAPE_H__ diff --git a/compute/ruy/include/ruy/TensorUtils.h b/compute/ruy/include/ruy/TensorUtils.h new file mode 100644 index 000000000..149037cc9 --- /dev/null +++ b/compute/ruy/include/ruy/TensorUtils.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_TENSOR_UTILS_H__ +#define __NNFW_RUY_TENSOR_UTILS_H__ + +#include "ruy/PortableTensorUtils.h" +#include "ruy/NeonTensorUtils.h" + +namespace nnfw +{ +namespace ruy +{ + +inline bool IsZeroVector(const float *vector, int v_size) +{ + return NEON_OR_PORTABLE(IsZeroVector, vector, v_size); +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_TENSOR_UTILS_H__ diff --git a/compute/ruy/include/ruy/Types.h b/compute/ruy/include/ruy/Types.h new file mode 100644 index 000000000..b19b59735 --- /dev/null +++ b/compute/ruy/include/ruy/Types.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_TYPES_H__ +#define __NNFW_RUY_TYPES_H__ + +#include <cassert> +#include <cstdint> +#include <type_traits> +#include <limits> +#include <string> +#include "Shape.h" + +namespace nnfw +{ +namespace ruy +{ + +enum class FusedActivationFunctionType +{ + kNone = 0, + kRelu6 = 1, + kRelu1 = 2, + kRelu = 3, + kTanh = 4, + kSigmoid = 6, +}; + +enum class PaddingType +{ + kNone = 0, + kSame = 1, + kValid = 2, +}; + +struct PaddingValues +{ + int16_t width; + int16_t height; +}; + +struct ConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + bool is_replaced_weights{false}; +}; + +struct FullyConnectedParams +{ + FusedActivationFunctionType activation{FusedActivationFunctionType::kNone}; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + float weights_scale; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params - no one use this params, but ruy might use them later. + float float_activation_min; + float float_activation_max; + // Mark the operands as cacheable if they are unchanging, e.g. weights. + bool lhs_cacheable; + bool rhs_cacheable; + // FullyConnectedWeightsFormat weights_format; +}; + +enum class Order +{ + kColMajor, + kRowMajor +}; + +enum class CachePolicy : std::uint8_t +{ + kNeverCache, + kCacheIfLargeSpeedup, + kAlwaysCache, +}; + +// MatrixParams encapsulates the parameters that Gemm needs about each +// matrix, besides the buffer data pointer. +// Compare to ruy::Matrix, which also encapsulates the data pointer. +// Rationale for leaving the data pointer out of here: doing so +// requires complicated const-correctness mechanics. See +// ruy::ConstCheckingPtr. +template <typename Scalar> struct MatrixParams +{ + // Storage layout order. For now we only do plain linear non-strided + // layout. It would be easy to support a stride if needed. + Order order = Order::kColMajor; + // Number of rows of the matrix. + int rows = 0; + // Number of columns of the matrix. + int cols = 0; + // The zero_point, i.e. which Scalar value is to be interpreted as zero. + // When Scalar is floating-point, this must be 0. + Scalar zero_point = 0; + // When the data pointed to by this matrix is constant data, so that it is + // valid to assume that equality of pointers implies equality of data, + // a CachePolicy may be used instead of the default kNeverCache, + // which will enable ruy to take advantage of this constancy of the data to + // cache the packing work, which can be a large speedup in matrix*vector + // and other narrow shapes. 
+ CachePolicy cache_policy = CachePolicy::kNeverCache; +}; + +// Enumeration of broad categories of Gemm. +// +// The primary reason for this to exist is to allow Gemm to compile +// only uniform-quantized or only per-channel-quantized code paths. +// This is unneeded with ruy as the back-end, as this is only a runtime +// difference in ruy, but with gemmlowp these really are separate code +// paths and templatizing in a QuantizationFlavor is necessary to avoid +// compiling unused gemmlowp code. Indeed, TFLite currently uses +// uint8 with uniform quantization and int8 with per-channel quantization, +// and does not use uint8 with per-channel. We want to avoid compiling +// the gemmlowp uint8 per-channel path when gemmlowp is the back-end. +// +// It's possible to drop this in the future if gemmlowp goes away and no +// other then-relevant backend library handles quantized paths in a way that +// requires knowing this at compile-time. +enum class QuantizationFlavor +{ + // Floating-point Gemm: the accumulators are not multiplied by any + // 'multiplier'. + kFloatingPoint, + // Quantized Gemm using a single multiplier for all accumulators. + kIntegerWithUniformMultiplier, + // Quantized Gemm using a separate multipliers for accumulators of each + // row of the destination matrix. This is what is called 'per-channel' + // in GemmParams. Here we use the more specific 'per-row' terminology + // to allow for the possibility of 'per-column' in the future, and to + // allow for that to be a separate code path in some back-end such as + // gemmlowp. + kIntegerWithPerRowMultiplier +}; + +// Additional parameters that Gemm needs, beyond what falls into +// the MatrixParams that it takes. Compare to ruy::Spec. +// +// Decoupling AccumScalar from DstScalar (rather than deducing it from that) +// is useful future-proofing. Think of a float16 path using float32 accum. +// +// QuantizationFlavor is passed here even though it's technically not used +// in this class. This is so that we retain the ability in the future to +// specialize this class for quantization flavor, and this allows for +// Gemm to be templatized in quantization_flavor via the GemmParams that it +// takes, allowing for automatic template parameter deduction to take place, +// so that most call sites don't need to specify a QuantizationFlavor +// (only those that need perchannel quantization do). +template <typename AccumScalar, typename DstScalar, + QuantizationFlavor quantization_flavor = + std::is_floating_point<AccumScalar>::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> +struct GemmParams +{ + // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) + // of the multiplier by which accumulators are multiplied before being casted + // to the destination type. + AccumScalar multiplier_fixedpoint = 0; + // Only for non-floating-point cases. The exponent part of the aforementioned + // multiplier. + int multiplier_exponent = 0; + // Per-channel variant of multiplier_fixedpoint. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_fixedpoint. + const AccumScalar *multiplier_fixedpoint_perchannel = nullptr; + // Per-channel variant of multiplier_exponent. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. 
Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_exponent. + // + // Either none or both of multiplier_exponent_perchannel and + // multiplier_fixedpoint_perchannel must be nullptr. + const int *multiplier_exponent_perchannel = nullptr; + // The bias vector data, if not null. + const AccumScalar *bias = nullptr; + // min clamp bound of destination values. + DstScalar clamp_min = std::is_floating_point<DstScalar>::value + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); + // max clamp bound of destination values. + DstScalar clamp_max = std::is_floating_point<DstScalar>::value + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); +}; + +// Validates self-consistency of GemmParams. +template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor> +void ValidateGemmParams(const GemmParams<AccumScalar, DstScalar, quantization_flavor> ¶ms) +{ + // Guard consistency of the quantized multiplier fields. + if (quantization_flavor == QuantizationFlavor::kFloatingPoint) + { + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier && + !std::is_same<DstScalar, int32_t>::value) + { + assert(params.multiplier_fixedpoint); + // Nothing to check about multiplier_exponent + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier && + !std::is_same<DstScalar, int32_t>::value) + { + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(params.multiplier_fixedpoint_perchannel); + assert(params.multiplier_exponent_perchannel); + } + else + { + // For the get raw accumulator case, we should make sure none of the + // quantization params are set. + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + UNUSED_RELEASE(params); +} + +inline CachePolicy DefaultCachePolicy(bool is_constant_data) +{ + return is_constant_data ? CachePolicy::kCacheIfLargeSpeedup : CachePolicy::kNeverCache; +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_TYPES_H__ diff --git a/compute/ruy/include/ruy/Utils.h b/compute/ruy/include/ruy/Utils.h new file mode 100644 index 000000000..50205abe5 --- /dev/null +++ b/compute/ruy/include/ruy/Utils.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RUY_UTILS_H__ +#define __NNFW_RUY_UTILS_H__ + +#include "Types.h" +#include "Shape.h" + +#include <stdexcept> + +namespace nnfw +{ +namespace ruy +{ +template <typename T> +inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, int b, int kheight, + int kwidth, int stride_width, int stride_height, + int pad_width, int pad_height, int in_width, int in_height, + int in_depth, int single_buffer_length, int buffer_id, + const T *in_data, T *conv_buffer_data, uint8_t zero_byte) +{ + assert(input_shape.DimensionsCount() == 4); + // This chunk of code reshapes all the inputs corresponding to + // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). + const int kwidth_times_indepth = kwidth * in_depth; + const int inwidth_times_indepth = in_width * in_depth; + const int ih_ungated_start = h * stride_height - pad_height; + const int ih_ungated_end = (ih_ungated_start + kheight); + const int ih_end = std::min(ih_ungated_end, in_height); + const int iw_ungated_start = w * stride_width - pad_width; + const int iw_ungated_end = (iw_ungated_start + kwidth); + const int iw_end = std::min(iw_ungated_end, in_width); + // If the patch is off the edge of the input image, skip writing those rows + // and columns from the patch into the output array. + const int h_offset = std::max(0, -ih_ungated_start); + const int w_offset = std::max(0, -iw_ungated_start); + const int ih_start = std::max(0, ih_ungated_start); + const int iw_start = std::max(0, iw_ungated_start); + const int single_row_num = std::min(kwidth - w_offset, in_width - iw_start) * in_depth; + const int output_row_offset = (buffer_id * single_buffer_length); + int out_offset = output_row_offset + (h_offset * kwidth + w_offset) * in_depth; + int in_offset = Offset(input_shape, b, ih_start, iw_start, 0); + + // Express all of the calculations as padding around the input patch. + const int top_padding = h_offset; + const int bottom_padding = (ih_ungated_end - ih_end); + const int left_padding = w_offset; + const int right_padding = (iw_ungated_end - iw_end); + assert(single_row_num == ((kwidth - (left_padding + right_padding)) * in_depth)); + + // Write out zeroes to the elements representing the top rows of the input + // patch that are off the edge of the input image. + if (top_padding > 0) + { + const int top_row_elements = (top_padding * kwidth * in_depth); + memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T))); + } + + // If the patch is on the interior of the input image horizontally, just copy + // over the rows sequentially, otherwise add zero padding at the start or end. 
+ if ((left_padding == 0) && (right_padding == 0)) + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + else + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + if (left_padding > 0) + { + const int left_start = (out_offset - (left_padding * in_depth)); + memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T))); + } + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + if (right_padding > 0) + { + const int right_start = (out_offset + single_row_num); + memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T))); + } + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + + // If the bottom of the patch falls off the input image, pad the values + // representing those input rows with zeroes. + if (bottom_padding > 0) + { + const int bottom_row_elements = (bottom_padding * kwidth * in_depth); + const int bottom_start = + output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); + } +} + +// Supports per-batch zero_byte for per-batch asymmetric quantized inputs. +template <typename T> +void DilatedIm2col(const ConvParams ¶ms, const Shape &input_shape, const T *input_data, + const Shape &filter_shape, const Shape &output_shape, T *im2col_data, + const int32_t *zero_bytes, const int zero_bytes_len) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + // For dilated convolution, the input pixels are not contiguous therefore we + // can't use the same optimizations as Im2Col(). Though note this code would + // work fine for the non-dilated case too (though likely a bit slower). + assert(dilation_width_factor != 1 || dilation_height_factor != 1); + assert(im2col_data); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + MatchingDim(output_shape, 3, filter_shape, 0); + + // Construct the MxN sized im2col matrix. + // The rows M, are sub-ordered B x H x W + const Shape row_shape({1, batches, output_height, output_width}); + // The columns, N, are sub-ordered Kh x Kw x Din + const Shape col_shape({1, filter_height, filter_width, input_depth}); + // Use dimensions M and N to construct dims for indexing directly into im2col + const Shape im2col_shape({1, 1, row_shape.FlatSize(), col_shape.FlatSize()}); + + // Loop through the output rows (B x H x W) + for (int batch = 0; batch < batches; ++batch) + { + const T zero_byte = + zero_bytes_len > 1 ? 
static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + // Each im2col row is an output pixel. Arrange the input data in this + // row in an order we can conveniently multiply with the filter data. + int row_offset = Offset(row_shape, 0, batch, out_y, out_x); + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + // Loop through all the pixels of the filter (Kh x Kw) + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + if ((in_y >= 0) && (in_y < input_height)) + { + // Filter row is within the input data. + // Loop through all the filter pixels in this row. + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0); + T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset); + if ((in_x >= 0) && (in_x < input_width)) + { + // Filter pixel is within the input, copy the input data. + T const *src = input_data + Offset(input_shape, batch, in_y, in_x, 0); + memcpy(dst, src, input_depth * sizeof(T)); + } + else + { + // Filter pixel is outside the input, zero it out. + memset(dst, zero_byte, input_depth * sizeof(T)); + } + } + } + else + { + // Filter row is outside the input, zero out the entire filter row. + int col_offset = Offset(col_shape, 0, filter_y, 0, 0); + T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset); + memset(dst, zero_byte, filter_width * input_depth * sizeof(T)); + } + } + } + } + } +} + +template <typename T> +void DilatedIm2col(const ConvParams ¶ms, uint8_t zero_byte, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const Shape &output_shape, + T *im2col_data) +{ + const int32_t zero_point = static_cast<int32_t>(zero_byte); + DilatedIm2col<T>(params, input_shape, input_data, filter_shape, output_shape, im2col_data, + &zero_point, 1); +} + +template <typename T> +void Im2col(const ConvParams ¶ms, int kheight, int kwidth, uint8_t zero_byte, + const Shape &input_shape, const T *input_data, const Shape &output_shape, + T *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + + int buffer_id = 0; + // Loop over the output nodes. 
+ for (int b = 0; b < batches; ++b) + { + for (int h = 0; h < output_height; ++h) + { + for (int w = 0; w < output_width; ++w) + { + ExtractPatchIntoBufferColumn(input_shape, w, h, b, kheight, kwidth, stride_width, + stride_height, pad_width, pad_height, input_width, + input_height, input_depth, output_depth, buffer_id, input_data, + output_data, zero_byte); + ++buffer_id; + } + } + } +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_UTILS_H__ diff --git a/compute/ruy/include/ruy/neon/neon_check.h b/compute/ruy/include/ruy/neon/neon_check.h new file mode 100644 index 000000000..08394f26f --- /dev/null +++ b/compute/ruy/include/ruy/neon/neon_check.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_NEON_CHECK_H__ +#define __NNFW_RUY_NEON_CHECK_H__ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#include <arm_neon.h> +#endif + +// Disable X86_NEON +// #if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON +#if 0 +#define USE_NEON +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wsequence-point" +#include "NEON_2_SSE.h" +#pragma GCC diagnostic pop +#endif + +// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is +// defined, PortableSomeFunc(args) otherwise. +#ifdef USE_NEON +// Always use Neon code +#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__) + +#else +// No NEON available: Use Portable code +#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__) + +#endif // defined(USE_NEON) + +#endif // __NNFW_RUY_NEON_CHECK_H__ diff --git a/compute/ruy/include/ruy/operation/Conv.h b/compute/ruy/include/ruy/operation/Conv.h new file mode 100644 index 000000000..2b9c8c390 --- /dev/null +++ b/compute/ruy/include/ruy/operation/Conv.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RUY_CONV_H__ +#define __NNFW_RUY_CONV_H__ + +#include "ruy/Types.h" +#include "ruy/Shape.h" +#include "ruy/Utils.h" +#include "ruy/RuySupport.h" + +#include <ruy/ruy.h> +#include <ruy/context.h> +#include <iostream> +#include <vector> + +namespace nnfw +{ +namespace ruy +{ + +class Conv +{ +public: + Conv() : _im2col_shape(4), _need_im2col(false), _prepared(false) {} + + void prepare(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape, + uint32_t stride_width, uint32_t stride_height, uint32_t dilation_width_factor, + uint32_t dilation_height_factor) + { + if (!_prepared) + { + IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height, + dilation_width_factor, dilation_height_factor); + _prepared = true; + } + } + + void operator()(const ConvParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data, + ::ruy::Context *ruy_context) + { + if (!_prepared) + { + // This means that input or output are dynamic or filter is not constant + IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width, + params.stride_height, params.dilation_width_factor, + params.dilation_height_factor); + _prepared = true; + } + + int im2col_size = _need_im2col ? _im2col_shape.FlatSize() : 0; + + // Use heap if size is larger than 8MB + if (im2col_size > 2 * 1024 * 1024) + { + std::unique_ptr<float[]> im2col_data = std::make_unique<float[]>(im2col_size); + ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, _im2col_shape, im2col_data.get(), ruy_context); + } + else if (im2col_size > 0) + { + float im2col_data[im2col_size]; + ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, _im2col_shape, im2col_data, ruy_context); + } + else + { + ConvFloat(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, _im2col_shape, nullptr, ruy_context); + } + } + +private: + void ConvFloat(const ConvParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data, + const Shape &im2col_shape, float *im2col_data, ::ruy::Context *ruy_context) + { + UNUSED_RELEASE(bias_shape); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + // NB: the float 0.0f value is represented by all zero bytes. 
+ const uint8_t float_zero_byte = 0x00; + const float *gemm_input_data = nullptr; + const Shape *gemm_input_shape = nullptr; + const int filter_width = filter_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; + const bool need_im2col = + stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; + if (need_dilated_im2col) + { + DilatedIm2col(params, float_zero_byte, input_shape, input_data, filter_shape, output_shape, + im2col_data); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; + } + else if (need_im2col) + { + assert(im2col_data); + Im2col(params, filter_height, filter_width, float_zero_byte, input_shape, input_data, + im2col_shape, im2col_data); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; + } + else + { + // TODO(aselle): We need to make sure to not send im2col if it is not + // needed. + assert(!im2col_data); + gemm_input_data = input_data; + gemm_input_shape = &input_shape; + } + + const int gemm_input_dims = gemm_input_shape->DimensionsCount(); + int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1); + int n = output_shape.Dims(3); + int k = gemm_input_shape->Dims(gemm_input_dims - 1); + + // When an optimized CBLAS implementation is not available, fall back + // to using cpu_backend_gemm. + MatrixParams<float> lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.rows = n; + lhs_params.cols = k; + MatrixParams<float> rhs_params; + rhs_params.order = Order::kColMajor; + rhs_params.rows = k; + rhs_params.cols = m; + MatrixParams<float> dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = n; + dst_params.cols = m; + GemmParams<float, float> gemm_params; + gemm_params.bias = bias_data; + gemm_params.clamp_min = output_activation_min; + gemm_params.clamp_max = output_activation_max; + + // Below code is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy + ::ruy::Matrix<float> ruy_lhs; + ::ruy::Matrix<float> ruy_rhs; + ::ruy::Matrix<float> ruy_dst; + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, filter_data, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, gemm_input_data, &ruy_rhs, true); + ruy_support::MakeRuyMatrix(dst_params, output_data, &ruy_dst); + + ::ruy::BasicSpec<float, float> ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); + + ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); + } + + void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape, + const Shape &output_shape, uint32_t stride_width, uint32_t stride_height, + uint32_t dilation_width_factor, uint32_t dilation_height_factor) + { + const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; + const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 || + kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1; + + _need_im2col = need_dilated_im2col || need_non_dilated_im2col; + + if (_need_im2col) + { + _im2col_shape.SetDim(0, output_shape.Dims(0)); + _im2col_shape.SetDim(1, output_shape.Dims(1)); + _im2col_shape.SetDim(2, output_shape.Dims(2)); + _im2col_shape.SetDim(3, input_shape.Dims(3) * kernel_shape.Dims(1) * kernel_shape.Dims(2)); + } + } + +private: + Shape _im2col_shape; + bool _need_im2col; + bool _prepared; +}; +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_CONV_H_ diff --git 
a/compute/ruy/include/ruy/operation/FullyConnected.h b/compute/ruy/include/ruy/operation/FullyConnected.h new file mode 100644 index 000000000..59facdb22 --- /dev/null +++ b/compute/ruy/include/ruy/operation/FullyConnected.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RUY_FULLY_CONNECTED_H__ +#define __NNFW_RUY_FULLY_CONNECTED_H__ + +#include "ruy/Shape.h" +#include "ruy/Types.h" +#include "ruy/Utils.h" +#include "ruy/RuySupport.h" + +#include <ruy/ruy.h> +#include <ruy/context.h> + +namespace nnfw +{ +namespace ruy +{ + +inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &, + const float *optional_bias_data, const Shape &output_shape, + float *output_data, ::ruy::Context *ruy_context) +{ + const int dims_count = weights_shape.DimensionsCount(); + const int input_rows = weights_shape.Dims(dims_count - 1); + MatrixParams<float> rhs_params; + rhs_params.order = Order::kColMajor; + rhs_params.rows = input_rows; + rhs_params.cols = input_shape.FlatSize() / input_rows; + rhs_params.cache_policy = DefaultCachePolicy(params.rhs_cacheable); + assert(input_shape.FlatSize() == (rhs_params.rows * rhs_params.cols)); + MatrixParams<float> lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.cols = weights_shape.Dims(dims_count - 1); + lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1); + lhs_params.cache_policy = DefaultCachePolicy(params.lhs_cacheable); + MatrixParams<float> dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1); + dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + GemmParams<float, float> gemm_params; + gemm_params.bias = optional_bias_data; + gemm_params.clamp_min = params.float_activation_min; + gemm_params.clamp_max = params.float_activation_max; + + // Below code was copied from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy + ::ruy::Matrix<float> ruy_lhs; + ::ruy::Matrix<float> ruy_rhs; + ::ruy::Matrix<float> ruy_dst; + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, weights_data, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, input_data, &ruy_rhs, true); + ruy_support::MakeRuyMatrix(dst_params, output_data, &ruy_dst); + + ::ruy::BasicSpec<float, float> ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); + + ::ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); +} + +} // namespace ruy +} // namespace nnfw + +#endif // __NNFW_RUY_FULLY_CONNECTED_H__ diff --git a/compute/test/cker/Range.cc b/compute/test/cker/Range.cc index 55f4fcf20..e5fe4801f 100644 --- a/compute/test/cker/Range.cc +++ b/compute/test/cker/Range.cc @@ 
-48,9 +48,7 @@ TEST(CKer_Operation, Range)
   const float start = 3;
   const float limit = 1;
   const float delta = -0.5;
-  std::vector<float> expected = {
-      3, 2.5, 2, 1.5,
-  };
+  std::vector<float> expected = {3, 2.5, 2, 1.5};
   std::vector<float> actual(expected.size());
   nnfw::cker::Range<float>(&start, &limit, &delta, actual.data());
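The new compute/ruy Conv kernel introduced above is driven in two steps: prepare() decides once whether an im2col buffer is needed, and operator() runs im2col (if any) plus the ruy GEMM. A hypothetical single-call sketch follows; the function name RunConvOnce and every tensor size are made up for illustration and are not part of the patch.

// Hypothetical usage sketch, not part of the patch. NHWC float convolution,
// 3x3 filter, VALID padding, stride 1; all sizes below are assumed.
#include "ruy/operation/Conv.h"
#include <ruy/context.h>
#include <limits>

void RunConvOnce(const float *input, const float *filter, const float *bias, float *output)
{
  using namespace nnfw::ruy;

  const Shape input_shape{1, 32, 32, 8};   // N, H, W, Cin
  const Shape filter_shape{16, 3, 3, 8};   // Cout, Kh, Kw, Cin
  const Shape bias_shape{16};
  const Shape output_shape{1, 30, 30, 16}; // (32 - 3 + 1) = 30 with VALID padding, stride 1

  ConvParams params{};
  params.padding_type = PaddingType::kValid;
  params.padding_values = {0, 0};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  ::ruy::Context ruy_context;
  Conv conv;
  // prepare() sizes the im2col buffer up front; operator() would also do this
  // lazily when shapes are dynamic.
  conv.prepare(input_shape, filter_shape, output_shape, params.stride_width,
               params.stride_height, params.dilation_width_factor,
               params.dilation_height_factor);
  conv(params, input_shape, input, filter_shape, filter, bias_shape, bias, output_shape, output,
       &ruy_context);
}

Note the buffer-placement choice in operator(): the im2col scratch stays on the stack for small shapes and moves to a heap allocation once it exceeds 2M elements (8 MB of floats).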