summaryrefslogtreecommitdiff
path: root/compute/ARMComputeEx/arm_compute/runtime/NEON
diff options
context:
space:
mode:
Diffstat (limited to 'compute/ARMComputeEx/arm_compute/runtime/NEON')
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h38
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h81
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h98
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h63
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h62
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h54
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h65
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h164
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h148
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h98
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h155
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h69
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h77
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h100
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h47
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h114
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h83
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h85
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h82
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h83
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h120
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h63
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h162
23 files changed, 2111 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
new file mode 100644
index 000000000..37bccc52c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__
+#define __ARM_COMPUTE_NEFUNCTIONSEX_H__
+
+#include <arm_compute/runtime/NEON/functions/NEArgMinMax.h>
+#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
+#include <arm_compute/runtime/NEON/functions/NECast.h>
+#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
+#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
+#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEPReLU.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
+#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
+#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h>
+#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
+
+#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h
new file mode 100644
index 000000000..604cd93c4
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__
+#define __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform arg min/max (index of min/max) operation */
+template <ReductionOperation op> class NEArgMinMaxStatic : public IFunction
+{
+public:
+ /** Constructor */
+ NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] axis Reduction axis.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ */
+ void configure(ITensor *input, int axis, ITensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMaxStatic
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
+ * @param[in] axis Reduction axis.
+ * @param[in] output Destination tensor. Data type supported: Same as @p input
+ *
+ * @return A status
+ */
+ static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEArgMinMaxLayer _reduction_kernel;
+ Tensor _reduced_out;
+ NEReshapeLayer _reshape;
+};
+
+/** Basic function to run arg max. */
+using NEArgMax = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>;
+/** Basic function to run arg min. */
+using NEArgMin = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>;
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
new file mode 100644
index 000000000..2a624656d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBinaryLogicalOperationKernel.
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8.
+ * @note The function performs a binary logical operation between two tensors.
+ */
+class NEBinaryLogicalOperation : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8.
+ * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ * @param[in] op Binary Logical Operation to be performed.
+ */
+ void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ * @param[in] op Binary Logical Operation to be performed.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, BinaryLogicalOperation op);
+};
+
+/** Basic function to run @ref NEBinaryLogicalOperationKernel
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8.
+ * @note The function performs a binary logical operation between two tensors.
+ */
+template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8
+ * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(ITensor *input1, ITensor *input2, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+};
+
+/** Basic function to run logical AND operation. */
+using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+/** Basic function to run logical OR operation. */
+using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
new file mode 100644
index 000000000..ae2f57f19
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECAST_H__
+#define __ARM_COMPUTE_NECAST_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */
+class NECast : public INESimpleFunctionNoBorder
+{
+public:
+ /** Configure the kernel.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+ * @param[out] output Destination tensor with the same dimensions of input. Data type supported:
+ * U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] input_subtype Sub data type of input.
+ */
+ void configure(const ITensor *input, ITensor *output,
+ SubDataType input_subtype = SubDataType::NONE);
+ /** Static function to check if given info will lead to a valid configuration of @ref NECast
+ *
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
+ * @param[in] input_subtype Sub data type of input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype = SubDataType::NONE);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NECAST_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
new file mode 100644
index 000000000..90c0751b8
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
+#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */
+class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] block_shape Block shape value.
+ */
+ void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEDepthToSpaceLayerEx.
+ *
+ * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Tensor output info. Data types supported: same as @p input
+ * @param[in] block_shape Block shape x value.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
new file mode 100644
index 000000000..f0c8ecdb5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
+#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform negative on an input tensor. */
+class NENegLayer : public INESimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input Input tensor. Data types supported: F16/F32/S32.
+ * @param[out] output Output tensor. Data types supported: same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+  /** Static function to check if given info will lead to a valid configuration of @ref NENegLayer
+ *
+ * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
new file mode 100644
index 000000000..0646f1668
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NEEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::NEEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class NEEmbeddingLookup : public INESimpleFunctionNoBorder
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of
+ * input. Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
+  /** Static function to check if given info will lead to a valid configuration of @ref NEEmbeddingLookup
+ *
+ * @param[in] input Source tensor info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+   * @param[in] lookups Lookups tensor info. Data types supported: S32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+};
+}
+#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
new file mode 100644
index 000000000..42a786821
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls
+ * the following kernels:
+ *
+ * -# @ref NETransposeKernel
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedHybridLayerReshapeWeights
+ *
+ * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported:
+ * QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
+ * NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
+ * and transpose_weights is set to true ) (called once)
+ * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
+ * asymmetric)
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
+ * not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class NEFullyConnectedHybridLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete;
+ /** Default move constructor */
+ NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete;
+ /** Default move assignment operator */
+ NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
+ ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedHybridLayer
+ *
+ * @param[in] input Source tensor info. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: S8.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
+
+ MemoryGroup _memory_group;
+ NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
+ NEQuantizationSymmetricKernel _quant_input_kernel;
+ NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
+ NEMultiplyScaleFactorKernel _multiply_scale_kernel;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _reshape_weights_output;
+ Tensor _quantized_input;
+ Tensor _scale_factor;
+ Tensor _gemmlowp_output;
+ const ITensor *_original_weights;
+ bool _are_weights_reshaped;
+ bool _accumulate_biases;
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
new file mode 100644
index 000000000..6bd67f322
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following
+ * NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and
+ * transpose_weights is set to true ) (called once)
+ * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized
+ * asymmetric)
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
+ * not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ * @note The difference from NEFullyConnectedLayer is that this class supports weights as input
+ * with performance loss.
+ */
+class NEFullyConnectedLayerEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete;
+ /** Default move constructor */
+ NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete;
+ /** Default move assignment operator */
+ NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
+ * multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases,
+ ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEFullyConnectedLayerEx
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed)
+ * weights will have as many rows as the product of the first 3 input's dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed)
+ * weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor info. Its shape should be equal to the output of a
+ * matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the
+ * function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is
+ * called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods override
+ void run() override;
+ void prepare() override;
+
+private:
+ void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
+
+ MemoryGroup _memory_group;
+ NEFlattenLayerKernel _flatten_kernel;
+ NEConvertFullyConnectedWeights _convert_weights;
+ NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
+ NEGEMM _mm_gemm;
+ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _flatten_output;
+ Tensor _gemmlowp_output;
+ Tensor _converted_weights_output;
+ Tensor _reshape_weights_output;
+ const ITensor *_original_weights;
+ bool _are_weights_converted;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _accumulate_biases;
+ bool _is_quantized;
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
new file mode 100644
index 000000000..18cb61bf9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        NEFullyConnectedReshapingLayer.h
+ * @brief       This file contains NEFullyConnectedReshapingLayer class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+#include <arm_compute/runtime/IMemoryManager.h>
+#include <arm_compute/runtime/Tensor.h>
+
+namespace arm_compute
+{
+/**
+ * @brief Class to run FullyConnected Layer after reshaping input tensor
+ */
+class NEFullyConnectedReshapingLayer : public arm_compute::IFunction
+{
+public:
+ enum class KernelType
+ {
+ GENERAL, //< General FC
+ PREPROCESSED_WEIGHTS //< Weights are constants so it can be preprocessed
+ };
+
+public:
+ NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+ : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
+ _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] weights The tensor that is filled with weight values
+ * @param[in] biases The tensor that is filled with bias values
+ * @param[in] output The destination tensor
+ * @param[in] needs_reshape Whether it needs to be reshaped or not
+ * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true.
+ * @param[in] kernel_type The kernel type for actual FullyConnected layer
+ * @return N/A
+ */
+ void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
+ const arm_compute::ITensor *biases, arm_compute::ITensor *output,
+ bool needs_reshape, const arm_compute::TensorShape &reshape,
+ KernelType kernel_type);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+ /**
+ * @brief Prepare the operation
+ * @return N/A
+ */
+ void prepare(void) override;
+
+private:
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ const arm_compute::ITensor *_input;
+ const arm_compute::ITensor *_weights;
+ const arm_compute::ITensor *_biases;
+ arm_compute::ITensor *_output;
+
+ // buffer for reshaping input tensor
+ arm_compute::Tensor _neon_buffer;
+
+private:
+ std::unique_ptr<arm_compute::IFunction> _neon_fc;
+ NEReshapeLayer _neon_reshape;
+ bool _needs_reshape;
+};
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
new file mode 100644
index 000000000..414b9f7d9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following
+ * NEON kernels if the DOT product instruction is not available:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel
+ * -# @ref NEGEMMTranspose1xWKernel
+ * -# @ref NEGEMMLowpMatrixMultiplyKernel
+ * -# @ref NEGEMMLowpOffsetContributionKernel
+ * -# @ref NEActivationLayer
+ *
+ * otherwise if the DOT product instruction is available:
+ *
+ * -# @ref NEGEMMLowpOffsetContributionKernel
+ *
+*/
+class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
+ /** Default move constructor */
+ NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
+ /** Default move assignment operator */
+ NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
+ /** Initialise the kernel's inputs, output
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ * This kernel performs the following computations:
+ *
+ * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+ * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ *
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
+ * QASYMM8/QASYMM8_SIGNED otherwise
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported:
+ * QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
+ * S32
+ * @param[out] output Output tensor. Data type supported:
+ * S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+ * and
+ * if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGEMMLowpMatrixMultiplyCoreEx
+ *
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
+ * QASYMM8/QASYMM8_SIGNED otherwise
+ *
+ * @param[in] a First input tensor info (Matrix A). Data type supported:
+ * QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
+ * supported: S32
+ * @param[in] output Output tensor info. Data type supported:
+ * S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
+ * and
+ * if the reshape of matrix B should be executed only for the first run
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
+ const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEGEMMAssemblyDispatch _asm_glue;
+ std::unique_ptr<INEKernel> _mm_kernel;
+ std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
+ std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
+ NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
+ NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
+ NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
+ NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
+ // NEActivationLayer _activation_func;
+
+ Tensor _vector_sum_col;
+ Tensor _vector_sum_row;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _mm_result_s32;
+ Tensor _signed_a;
+ Tensor _signed_output;
+ const ITensor *_original_b;
+ int32_t _a_offset;
+ int32_t _b_offset;
+
+ bool _run_vector_matrix_multiplication;
+ bool _assembly_path;
+ bool _fused_assembly_path;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _fuse_output_stage;
+ bool _run_activation;
+ bool _flip_signedness;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
new file mode 100644
index 000000000..d95e6a81e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHEREX_H__
+#define __ARM_COMPUTE_NEGATHEREX_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEGatherKernelEx */
+class NEGatherEx : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ */
+ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGatherKernelEx
+ *
+ * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis);
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
new file mode 100644
index 000000000..69abf0192
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NEHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::NEHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class NEHashtableLookup : public INESimpleFunctionNoBorder
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of
+ * input. Data types supported: S32
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
+ ITensor *hits);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEHashtableLookup
+ *
+ * @param[in] lookups Lookups 1D tensor info.
+ * Data types supported: S32
+ * @param[in] keys Keys 1D tensor info. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor info.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] hits Hits 1D tensor info. A boolean tensor that indicates whether the lookup
+ * hits (True) or not (False). Data types supported: U8/QASYMM8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits);
+};
+}
+#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
new file mode 100644
index 000000000..521f50d2f
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform an Instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref NEInstanceNormalizationLayerKernelEx
+ */
+class NEInstanceNormalizationLayerEx : public IFunction
+{
+public:
+ /** Constructor */
+ NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor.
+ * Defaults to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta,
+ float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEInstanceNormalizationLayer.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported:
+ * NHWC, NCHW
+ * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+ * input.
+ * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults
+ * to 1.0
+ * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor.
+ * Defaults to 0.0
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+ float epsilon = 1e-12f);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEInstanceNormalizationLayerKernelEx _normalization_kernel;
+ bool _is_nchw;
+ NEPermute _permute_input;
+ NEPermute _permute_output;
+ Tensor _permuted_input;
+ Tensor _permuted_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
new file mode 100644
index 000000000..5664c57cb
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPRELU_H__
+#define __ARM_COMPUTE_NEPRELU_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEPReLUKernel */
+class NEPReLU : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F32.
+ * @param[in] alpha Alpha (slope) tensor. Data types supported: Same as @p input.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEPRELU_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
new file mode 100644
index 000000000..17c37d806
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__
+#define __ARM_COMPUTE_NERNNLAYER_EX_H__
+
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to run @ref NERNNLayerEx */
+class NERNNLayerEx : public IFunction
+{
+public:
+  /** Default constructor */
+  NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NERNNLayerEx(const NERNNLayerEx &) = delete;
+  /** Default move constructor */
+  NERNNLayerEx(NERNNLayerEx &&) = default;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NERNNLayerEx &operator=(const NERNNLayerEx &) = delete;
+  /** Default move assignment operator */
+  NERNNLayerEx &operator=(NERNNLayerEx &&) = default;
+  /** Initialize the function
+   *
+   * @param[in]     input             Input is a 2-D tensor of shape [input_size, batch_size]. Data
+   *                                  types supported: F16/F32
+   * @param[in]     weights           Weights tensor of shape [input_size, num_units] that
+   *                                  multiplies the input. Data types supported: Same as @p input
+   * @param[in]     recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
+   *                                  the current 'state'. Data types supported: Same as @p input
+   * @param[in]     bias              Bias vector of shape [num_units]. Data types supported: Same
+   *                                  as @p input
+   * @param[in,out] hidden_state      Hidden state tensor of shape [num_units, batch_size]. Data
+   *                                  types supported: Same as @p input
+   * @param[out]    output            Output tensor of shape [num_units, batch_size]. Data types
+   *                                  supported: Same as @p input
+   * @param[in]     info              Activation layer parameter.
+   */
+  void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights,
+                 const ITensor *bias, ITensor *hidden_state, ITensor *output,
+                 ActivationLayerInfo &info); // NOTE(review): validate() takes const& — confirm whether this can be const too
+  /** Static function to check if given info will lead to a valid configuration of @ref NERNNLayerEx
+   *
+   * @param[in] input             Input is a 2-D tensor of shape [input_size, batch_size]. Data
+   *                              types supported: F16/F32
+   * @param[in] weights           Weights tensor of shape [input_size, num_units] that multiplies
+   *                              the input. Data types supported: Same as @p input
+   * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
+   *                              current 'state'. Data types supported: Same as @p input
+   * @param[in] bias              Bias vector of shape [num_units]. Data types supported: Same as @p
+   *                              input
+   * @param[in] hidden_state      Hidden state tensor info of shape [num_units, batch_size]. Data
+   *                              types supported: Same as @p input
+   * @param[in] output            Output tensor info of shape [num_units, batch_size]. Data types
+   *                              supported: Same as @p input
+   * @param[in] info              Activation layer parameter.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+                         const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+                         const ITensorInfo *hidden_state, const ITensorInfo *output,
+                         const ActivationLayerInfo &info);
+
+  // Inherited methods overridden:
+  void run() override;
+  void prepare() override;
+
+private:
+  MemoryGroup _memory_group;                   /**< Memory group for intermediate tensors */
+  NEGEMM _gemm_state_f;                        /**< GEMM on the recurrent 'state' */
+  NEArithmeticAdditionKernel _add_kernel;      /**< Adds FC output and state GEMM output */
+  NEActivationLayerKernel _activation_kernel;  /**< Applies the activation from @p info */
+  NEFullyConnectedLayer _fully_connected_kernel; /**< Fully connected on the input */
+  NECopyKernel _copy_kernel;                   /**< Copies result into hidden state / output */
+  Tensor _fully_connected_out;                 /**< Intermediate fully-connected result */
+  Tensor _gemm_output;                         /**< Intermediate state GEMM result */
+  Tensor _add_output;                          /**< Intermediate addition result */
+  bool _is_prepared;                           /**< Whether prepare() has run */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
new file mode 100644
index 000000000..7209acf19
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform a reduce-mean operation */
+class NEReduceMeanEx : public IFunction
+{
+public:
+  /** Constructor */
+  NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Configure kernel
+   *
+   * @note Supported tensor rank: up to 4
+   *
+   * @param[in]  input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in]  reduction_axis Reduction axis vector.
+   * @param[in]  keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[out] output         Destination tensor. Data type supported: Same as @p input
+   */
+  void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+                 ITensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEReduceMeanEx
+   *
+   * @param[in] input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in] reduction_axis Reduction axis vector.
+   * @param[in] keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[in] output         Destination tensor. Data type supported: Same as @p input
+   *
+   * @return A status
+   */
+  static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                         bool keep_dims, const ITensorInfo *output);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  MemoryGroup _memory_group;                                       /**< Memory group for intermediates */
+  std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr}; /**< One reduction per axis */
+  std::unique_ptr<Tensor[]> _reduced_outs{nullptr};                /**< Intermediate per-axis outputs */
+  NEReshapeLayer _reshape;                                         /**< Final reshape when !keep_dims */
+  unsigned int _reduction_ops;                                     /**< Number of reduction axes */
+  bool _keep_dims;                                                 /**< Whether reduced dims are kept */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
new file mode 100644
index 000000000..9c558e6a2
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
+#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform reduce operation */
+class NEReduceOperation : public IFunction
+{
+public:
+  /** Constructor */
+  NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Configure kernel
+   *
+   * @note Supported tensor rank: up to 4
+   *
+   * @param[in]  input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in]  reduction_axis Reduction axis vector.
+   * @param[in]  keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[out] output         Destination tensor. Data type supported: Same as @p input
+   * @param[in]  op             Reduce operation to perform.
+   */
+  void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output,
+                 ReduceOperation op);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEReduceOperation
+   *
+   * @param[in] input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in] reduction_axis Reduction axis vector.
+   * @param[in] keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[in] output         Destination tensor. Data type supported: Same as @p input
+   * @param[in] op             Reduce operation to perform.
+   *
+   * @return A status
+   */
+  static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                         bool keep_dims, const ITensorInfo *output, ReduceOperation op);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  MemoryGroup _memory_group;                            /**< Memory group for intermediates */
+  std::vector<NEReductionOperationEx> _reduction_kernels; /**< One reduction per axis */
+  std::vector<Tensor> _reduced_outs;                    /**< Intermediate per-axis outputs */
+  NEReshapeLayer _reshape;                              /**< Final reshape when !keep_dims */
+  unsigned int _reduction_ops;                          /**< Number of reduction axes */
+  bool _keep_dims;                                      /**< Whether reduced dims are kept */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
new file mode 100644
index 000000000..c028ea658
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__
+#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform a reduce-sum operation */
+class NEReduceSum : public IFunction
+{
+public:
+  /** Constructor */
+  NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Configure kernel
+   *
+   * @note Supported tensor rank: up to 4
+   *
+   * @param[in]  input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in]  reduction_axis Reduction axis vector.
+   * @param[in]  keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[out] output         Destination tensor. Data type supported: Same as @p input
+   */
+  void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+                 ITensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum
+   *
+   * @param[in] input          Source tensor. Data type supported: QASYMM8/F16/F32
+   * @param[in] reduction_axis Reduction axis vector.
+   * @param[in] keep_dims      If true, retains reduced dimensions with length 1.
+   * @param[in] output         Destination tensor. Data type supported: Same as @p input
+   *
+   * @return A status
+   */
+  static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                         bool keep_dims, const ITensorInfo *output);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  MemoryGroup _memory_group;                          /**< Memory group for intermediates */
+  std::vector<NEReductionOperation> _reduction_kernels; /**< One SUM reduction per axis */
+  std::vector<Tensor> _reduced_outs;                  /**< Intermediate per-axis outputs */
+  NEReshapeLayer _reshape;                            /**< Final reshape when !keep_dims */
+  unsigned int _reduction_ops;                        /**< Number of reduction axes */
+  bool _keep_dims;                                    /**< Whether reduced dims are kept */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h
new file mode 100644
index 000000000..7180742df
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
+#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to simulate a reduction operation. This function calls the following NEON
+ * kernels:
+ *
+ * -# @ref NEFillBorderKernel
+ * -# @ref NEReductionOperationKernelEx
+ *
+ */
+class NEReductionOperationEx : public IFunction
+{
+public:
+  /** Default constructor */
+  NEReductionOperationEx();
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input  Source tensor. Data type supported: QASYMM8/F16/F32.
+   * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
+   * @param[in]  axis   Dimension along which to reduce.
+   * @param[in]  op     Reduction operation to perform.
+   */
+  void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEReductionOperationEx.
+   *
+   * @param[in] input  Source tensor info. Data type supported: QASYMM8/F16/F32.
+   * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p
+   * input.
+   * @param[in] axis   Dimension along which to reduce.
+   * @param[in] op     Reduction operation to perform.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
+                         ReduceOperation op);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  NEReductionOperationKernelEx _reduction_kernel; /**< Reduction kernel to run */
+  NEFillBorderKernel _fill_border_kernel;         /**< Border-fill kernel to run */
+  size_t _window_split;    /**< presumably the window dimension used to split execution — confirm in .cpp */
+  int _reduction_axis;     /**< Axis being reduced, as set by configure() */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
new file mode 100644
index 000000000..302f9af2e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
+#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
+#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to spatially divide a tensor. This function calls the following NEON
+ * kernels/functions:
+ *
+ * -# @ref NEMemsetKernel
+ * -# @ref NESpaceToBatchLayerKernel
+ */
+class NESpaceToBatchLayerEx : public IFunction
+{
+public:
+  /** Default constructor */
+  NESpaceToBatchLayerEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete;
+  /** Allow instances of this class to be moved */
+  NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default;
+  /** Allow instances of this class to be moved */
+  NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default;
+  /** Default destructor */
+  virtual ~NESpaceToBatchLayerEx() = default;
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported:
+   *                         U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[in]  block_shape 1-D tensor with shape [M]. Data types supported: S32
+   * @param[in]  paddings    2-D tensor with shape [2, M]. Data types supported: S32
+   * @param[out] output      Tensor output. Data types supported: same as @p input
+   */
+  void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings,
+                 ITensor *output);
+  /** Set the input and output tensors. (Static block shape and paddings)
+   *
+   * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported:
+   *                           U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[in]  block_shape_x Block shape x value.
+   * @param[in]  block_shape_y Block shape y value.
+   * @param[in]  padding_left  The left padding of the output tensor.
+   * @param[in]  padding_right The right padding of the output tensor.
+   * @param[out] output        Tensor output. Data types supported: same as @p input
+   */
+  void configure(const ITensor *input, const int block_shape_x, const int block_shape_y,
+                 const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NESpaceToBatchLayerEx
+   *
+   * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported:
+   *                        U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32
+   * @param[in] paddings    paddings tensor info with shape [2, M]. Data types supported: S32
+   * @param[in] output      Tensor output info. Data types supported: same as @p input
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape,
+                         const ITensorInfo *paddings, const ITensorInfo *output);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NESpaceToBatchLayerEx (Static block shape and paddings)
+   *
+   * @param[in] input         Tensor input info. Supported tensor rank: 4. Data types supported:
+   *                          U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[in] block_shape_x Block shape x value.
+   * @param[in] block_shape_y Block shape y value.
+   * @param[in] padding_left  The left padding of the output tensor.
+   * @param[in] padding_right The right padding of the output tensor.
+   * @param[in] output        Tensor output info. Data types supported: same as @p input
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y,
+                         const Size2D &padding_left, const Size2D &padding_right,
+                         const ITensorInfo *output);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
+  NEMemsetKernel _memset_kernel;                    /**< Memset kernel to run */
+  bool _has_padding;                                /**< Flag to check if the output has padding */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
new file mode 100644
index 000000000..117717b55
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
+#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** This function calls the following NEON kernels/functions:
+ *
+ * -# @ref NESpaceToDepthLayerKernelEx
+ */
+class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder
+{
+public:
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported:
+   *                         U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[out] output      Tensor output. Data types supported: same as @p input
+   * @param[in]  block_shape Block shape value
+   */
+  void configure(const ITensor *input, ITensor *output, int32_t block_shape);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NESpaceToDepthLayerEx
+   *
+   * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported:
+   *                        U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[in] output      Tensor output info. Data types supported: same as @p input
+   * @param[in] block_shape Block shape value
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
new file mode 100644
index 000000000..a50b9ea60
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Function to run the deconvolution layer.
+ *
+ * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
+ * input depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user
+ * specified value where a < stride - 1 that increases the padding top and right of the input image.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where
+ * width is the size of the first input dimension.
+ * height is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y is the input stride of the first and second dimension.
+ *
+ * The weights used by Transpose convolution are supposed to be the same as the ones used for
+ * Convolution. Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref
+ * CPPFlipWeightsKernel.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsampleEx
+ * -# @ref NEConvolutionLayer
+ *
+ */
+class NETransposeConvLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer(const NETransposeConvLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ /** Default destructor */
+ virtual ~NETransposeConvLayer() = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data types
+ * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+                const PadStrideInfo &info, unsigned int invalid_right,
+                unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data types supported: S32 for
+ * QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+                        const ITensorInfo *bias, const ITensorInfo *output,
+                        const PadStrideInfo &info, unsigned int invalid_right,
+                        unsigned int invalid_bottom);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEConvolutionLayer _conv_f;         // convolution applied to the upsampled input
+ CPPUpsampleEx _upsample_f;          // zero-insertion upsampling of the input
+ CPPFlipWeightsKernel _flip_weights; // reverses the weights (see class comment)
+ NEPermute _permute_input;           // NOTE(review): permutes presumably convert NCHW<->NHWC — confirm in .cpp
+ NEPermute _permute_weights;
+ NEPermute _permute_output;
+ Tensor _scaled_output;              // intermediate upsampled tensor fed to _conv_f
+ Tensor _weights_flipped;            // holds the output of _flip_weights
+ Tensor _permuted_input;
+ Tensor _permuted_weights;
+ Tensor _permuted_output;
+ bool _is_nchw;                      // assumes true when data layout is NCHW — TODO confirm against .cpp
+ const ITensor *_original_weights;   // original (unflipped) weights, presumably re-used in prepare(); verify
+ ITensor *_input;
+ PadStrideInfo _info;
+ bool _is_prepared;                  // guards one-time work in prepare()
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */