47 files changed, 5150 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
new file mode 100644
index 000000000..158fe0b0c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This empty file aims to validate "CLFunctionsEx.h".
+//      DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
new file mode 100644
index 000000000..ae64a6edd
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+// Default constructor. It does no work of its own: the kernel array and the intermediate
+// tensors that the function operates on are only created later, inside configure().
+// NOTE(review): member defaults presumably come from the class declaration - confirm in header.
+CLArgOperation::CLArgOperation() = default;
+
+void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+                               ArgOperation op)
+{
+  // Fail fast: null tensors are dereferenced below and an empty axis makes "size() - 1" wrap.
+  if (input == nullptr || output == nullptr || axis.empty())
+    ARM_COMPUTE_ERROR("CLArgOperation: tensors must be non-null and axis must be non-empty");
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+  _input = input;
+  _output = output;
+  _axis = axis;
+  _arg_op = op;
+  // NOTE The argminmax_axis must have no duplication.
+  _num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _argop_kernels =
+      arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
+
+  // Intermediate i keeps the input's type/layout with the first i + 1 listed axes set to 1.
+  TensorShape shape{input->info()->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(_axis[i], 1);
+    _interm_tensors[i].allocator()->init(
+        TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
+            .set_data_layout(input->info()->data_layout()));
+  }
+
+  // One kernel per axis, chained as input -> interm[0] -> ... -> interm[n-2] -> output.
+  for (size_t i = 0; i < _num_of_kernels; i++)
+  {
+    ICLTensor *src = (i == 0) ? input : &_interm_tensors[i - 1];
+    ICLTensor *dst = (i + 1 == _num_of_kernels) ? output : &_interm_tensors[i];
+    _argop_kernels[i].configure(src, dst, _axis[i], op);
+  }
+
+  // Allocate the intermediates last, once every kernel touching them has been configured.
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+    _interm_tensors[i].allocator()->allocate();
+}
+
+// Static check of an (input, axis, output, op) combination. Returns an error Status when an
+// argument is unusable or when any per-axis kernel rejects its part of the chain.
+Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+                                const ITensorInfo *output, ArgOperation op)
+{
+  // Guard first: null infos are dereferenced below, and with an empty axis list the
+  // "axis.size() - 1" computation wraps around (size_t) into a huge array allocation.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input == nullptr || output == nullptr, "Null tensor info");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.empty(), "At least one axis must be given");
+
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Intermediate i mirrors the input info with the first i + 1 listed axes collapsed to 1.
+  TensorShape shape{input->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    // An out-of-range axis must not reach TensorShape::set().
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis[i] >= TensorShape::num_max_dimensions, "Invalid axis");
+    shape.set(axis[i], 1);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+  }
+
+  // Validate the chain input -> interm[0] -> ... -> interm[n-2] -> output, one kernel per axis.
+  for (size_t i = 0; i < num_of_kernels; i++)
+  {
+    const ITensorInfo *src = (i == 0) ? input : &interm_tensors[i - 1];
+    const ITensorInfo *dst = (i + 1 == num_of_kernels) ? output : &interm_tensors[i];
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArgOperationKernel::validate(src, dst, axis[i], op));
+  }
+
+  return Status{};
+}
+
+void CLArgOperation::run()
+{
+  // Enqueue the per-axis kernels in the same order in which configure() chained them.
+  CLScheduler &scheduler = CLScheduler::get();
+  for (size_t idx = 0; idx != _num_of_kernels; ++idx)
+    scheduler.enqueue(_argop_kernels[idx]);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+                                  BinaryLogicalOperation op)
+{
+  auto logical_kernel = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+  logical_kernel->configure(input1, input2, output, op);
+  _kernel = std::move(logical_kernel);
+
+  // A border is only considered when the output spans more than one element in dimension 0.
+  if (output->info()->dimension(0) <= 1)
+    return;
+
+  // Pick input1 when its dimension 0 is 1, otherwise input2; replicate only if the pick is 1.
+  ICLTensor *candidate = (input1->info()->dimension(0) == 1) ? input1 : input2;
+  if (candidate->info()->dimension(0) == 1)
+    _border_handler.configure(candidate, _kernel->border_size(), BorderMode::REPLICATE);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
new file mode 100644
index 000000000..742fc6f59
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+
+#include "arm_compute/core/CL/kernels/CLCastKernel.h"
+
+using namespace arm_compute;
+
+void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
+{
+  auto cast_kernel = arm_compute::support::cpp14::make_unique<CLCastKernel>();
+  cast_kernel->configure(input, output, input_subtype);
+  _kernel = std::move(cast_kernel); // the configured kernel is kept in _kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
new file mode 100644
index 000000000..c2e4ca9ff
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+using namespace arm_compute;
+
+void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+  auto d2s_kernel = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
+  d2s_kernel->configure(input, output, block_size);
+  _kernel = std::move(d2s_kernel); // the configured kernel is kept in _kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..2781784ca
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+                                  const ICLTensor *lookups)
+{
+  auto lookup_kernel = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+  lookup_kernel->configure(input, output, lookups);
+  _kernel = std::move(lookup_kernel); // the configured kernel is kept in _kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..c6b166163
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"
+
+using namespace arm_compute;
+
+void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input,
+                                               const arm_compute::ICLTensor *weights,
+                                               const arm_compute::ICLTensor *biases,
+                                               arm_compute::ICLTensor *output, bool needs_reshape,
+                                               const arm_compute::TensorShape &reshape)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  if (!_needs_reshape)
+  {
+    // No reshape requested: the fully connected layer reads the input tensor directly.
+    _cl_fc.configure(_input, _weights, _biases, _output);
+    return;
+  }
+
+  // Describe the internal buffer: a clone of the input info carrying the requested shape.
+  auto_init_if_empty(*_cl_buffer.info(),
+                     _input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
+                         _input->info()->data_layout()));
+  // Data path: input -> (reshape) -> _cl_buffer -> (fully connected) -> output.
+  _cl_reshape.configure(_input, &_cl_buffer);
+  _cl_fc.configure(&_cl_buffer, _weights, _biases, _output);
+
+  // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+  _cl_buffer.allocator()->allocate();
+}
+
+void CLFullyConnectedReshapingLayer::run(void)
+{
+  if (_needs_reshape)
+    _cl_reshape.run(); // optional stage: only configured when needs_reshape was requested
+
+  _cl_fc.run();
+}
+
+void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); } // forwards to _cl_fc
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
new file mode 100644
index 000000000..6cad9bd2e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+using namespace arm_compute;
+
+void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
+                           int axis)
+{
+  auto gather_kernel = arm_compute::support::cpp14::make_unique<CLGatherExKernel>();
+  gather_kernel->configure(input, indices, output, axis);
+  _kernel = std::move(gather_kernel); // the configured kernel is kept in _kernel
+}
+
+Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+                            const ITensorInfo *output, int axis)
+{
+  return CLGatherExKernel::validate(input, indices, output, axis); // kernel decides validity
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..7180e9356
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+                                  const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+  auto hashtable_kernel = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+  hashtable_kernel->configure(lookups, keys, input, output, hits);
+  _kernel = std::move(hashtable_kernel); // the configured kernel is kept in _kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..86ea5a66d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() = default; // empty on purpose
+
+void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+                                               ICLTensor *gamma, ICLTensor *beta, float epsilon)
+{
+  auto norm = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+  norm->configure(input, output, gamma, beta, epsilon);
+  _kernel = std::move(norm); // the configured kernel is kept in _kernel
+}
+
+Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                const ITensorInfo *gamma, const ITensorInfo *beta,
+                                                float epsilon) // forwards to the kernel validate
+{
+  return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..be35ea732
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+  auto neg_kernel = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+  neg_kernel->configure(input, output);
+  _kernel = std::move(neg_kernel); // the configured kernel is kept in _kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
new file mode 100644
index 000000000..38adedd10
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPReLU.h"
+
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+  auto prelu_kernel = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
+  prelu_kernel->configure(input, alpha, output);
+  _kernel = std::move(prelu_kernel);
+
+  // A border is only considered when the output spans more than one element in dimension 0.
+  if (output->info()->dimension(0) <= 1)
+    return;
+
+  // Pick input when its dimension 0 is 1, otherwise alpha. The border handler is configured
+  // (replicate mode) only if the picked tensor really has size 1 along dimension 0.
+  ICLTensor *candidate = (input->info()->dimension(0) == 1) ? input : alpha;
+  if (candidate->info()->dimension(0) == 1)
+    _border_handler.configure(candidate, _kernel->border_size(), BorderMode::REPLICATE);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
new file mode 100644
index 000000000..2a34c0664
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+      _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+      _gemm_output(), _add_output(), _is_prepared(false) // prepare() has not run yet
+{
+}
+
+Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                              const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+                              const ITensorInfo *hidden_state, const ITensorInfo *output,
+                              const ActivationLayerInfo &info)
+{
+  const int idx_width = 0;  // dimension index 0
+  const int idx_height = 1; // dimension index 1
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+                                      output);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+                              recurrent_weights->dimension(idx_width));
+  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+                              recurrent_weights->dimension(1));
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                     hidden_state->tensor_shape());
+  // shape_info stands in for every intermediate result in the per-stage checks below.
+  auto shape_info =
+      TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+                 input->data_type());
+  // Validate, in order, the stages that configure() sets up: FC, GEMM, ADD and activation.
+  ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
+      ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+  // Reaching this point means none of the checks above returned an error.
+  return Status{};
+}
+
+void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
+                             const ICLTensor *recurrent_weights, const ICLTensor *bias,
+                             ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+  ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
+                                                    recurrent_weights->info(), bias->info(),
+                                                    hidden_state->info(), output->info(), info));
+
+  // All three intermediate tensors share one single-channel descriptor of the input's data type.
+  const size_t state_dim1 = hidden_state->info()->dimension(1);
+  const TensorInfo interm_info(compute_rnn_shape(recurrent_weights->info(), state_dim1), 1,
+                               input->info()->data_type());
+
+  _is_prepared = false;
+  _fully_connected_out.allocator()->init(interm_info);
+  _gemm_output.allocator()->init(interm_info);
+
+  // Each intermediate is registered with the memory group before the stage writing it is set up.
+  _memory_group.manage(&_fully_connected_out);
+  _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+  _memory_group.manage(&_gemm_output);
+  _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+  _add_output.allocator()->init(interm_info);
+  _memory_group.manage(&_add_output);
+
+  _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
+                        &_add_output, ConvertPolicy::SATURATE);
+
+  _fully_connected_out.allocator()->allocate();
+  _gemm_output.allocator()->allocate();
+
+  _activation_kernel.configure(&_add_output, hidden_state, info);
+  _add_output.allocator()->allocate();
+
+  _copy_kernel.configure(hidden_state, output);
+}
+
+void CLRNNLayerEx::run()
+{
+  prepare();
+
+  _memory_group.acquire();
+
+  _fully_connected_kernel.run();
+  _gemm_state_f.run();
+  // Queue the add and activation kernels, then the kernel copying hidden state to output.
+  CLScheduler &scheduler = CLScheduler::get();
+  scheduler.enqueue(_add_kernel);
+  scheduler.enqueue(_activation_kernel);
+  scheduler.enqueue(_copy_kernel);
+
+  _memory_group.release();
+}
+
+void CLRNNLayerEx::prepare()
+{
+  // One-off step: once the flag is set, later calls return immediately.
+  if (_is_prepared)
+    return;
+
+  _fully_connected_kernel.prepare();
+  _gemm_state_f.prepare();
+  _is_prepared = true;
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..13a25c901
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+      _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+{
+}
+
+// Validates a multi-axis reduction: one CLReduceOperationKernel per entry of axis, chained
+// through intermediate tensor infos, plus a final reshape when keep_dims is false.
+// Returns the first failing Status, or an empty Status when every stage validates.
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const std::set<uint32_t> &axis, bool keep_dims,
+                                   const ReduceOperation &op)
+{
+  // An empty axis set would wrap the size_t counts below and index far out of bounds.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.empty(), "Reduce axis must not be empty");
+
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+  // Each intermediate info takes the input's metadata with one more axis collapsed to 1.
+  auto interm_tensors =
+      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+  TensorShape shape{input->tensor_shape()};
+  auto it = axis.begin();
+  for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+  {
+    shape.set(*it, 1, false);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+    interm_tensors[i].set_data_layout(input->data_layout());
+    interm_tensors[i].set_quantization_info(input->quantization_info());
+  }
+
+  // Set a vector that is ordered ITensorInfo sequentially.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; ++i)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate ReduceOperation only on all kernels
+  it = axis.begin();
+  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+  }
+  if (!keep_dims)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+  }
+  return Status{};
+}
+
+// Builds the reduction chain: one kernel per axis, each writing into an intermediate tensor
+// managed by the memory group; with keep_dims the last kernel writes straight into output,
+// otherwise a reshape layer takes the last intermediate tensor to output.
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+                                  const std::set<uint32_t> &axis, bool keep_dims,
+                                  ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+  _keep_dims = keep_dims;
+  // NOTE The axis must have no duplication.
+  const size_t kernel_count = axis.size();
+  const size_t interm_count = keep_dims ? kernel_count - 1 : kernel_count;
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(interm_count);
+  _reduce_kernels =
+      arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(kernel_count);
+
+  // chain[i] is the input of kernel i and chain[i + 1] its output.
+  std::vector<ICLTensor *> chain;
+  chain.push_back(input);
+  for (size_t i = 0; i < interm_count; ++i)
+  {
+    chain.push_back(&_interm_tensors[i]);
+  }
+  chain.push_back(output);
+
+  // Apply ReduceOperation on all kernels
+  TensorShape out_shape{input->info()->tensor_shape()};
+  auto axis_it = axis.begin();
+  for (size_t i = 0; i < kernel_count; ++i, ++axis_it)
+  {
+    out_shape.set(*axis_it, 1, false);
+    if (i < interm_count)
+    {
+      _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(out_shape));
+      _memory_group.manage(&_interm_tensors[i]);
+    }
+    _reduce_kernels[i].configure(chain[i], chain[i + 1], *axis_it, op);
+    if (i > 0)
+    {
+      _interm_tensors[i - 1].allocator()->allocate();
+    }
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    _reshape.configure(&_interm_tensors[interm_count - 1], output);
+    _interm_tensors[interm_count - 1].allocator()->allocate();
+  }
+}
+
+// Enqueues one reduction kernel per configured axis, in order, then runs the reshape layer
+// unless keep_dims was requested. scope_mg ties the memory group to the lifetime of this call.
+void CLReduceOperation::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+  for (size_t idx = 0, kernel_count = _axis.size(); idx < kernel_count; ++idx)
+  {
+    CLScheduler::get().enqueue(_reduce_kernels[idx]);
+  }
+
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size,
+                                 const ICLTensor *padding_size, ICLTensor *output)
+{
+  auto kernel = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>();
+  kernel->configure(input, block_size, padding_size, output); // set up with all four tensors
+  _kernel = std::move(kernel);                                // ownership moves to _kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
new file mode 100644
index 000000000..0f455f96f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+  auto kernel = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+  kernel->configure(input, output, block_size); // set up with the tensors and block size
+  _kernel = std::move(kernel);                  // ownership moves to _kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
new file mode 100644
index 000000000..80d50ad94
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "../../topk_v2.h"
+
+namespace arm_compute
+{
+
+CLTopKV2::CLTopKV2()
+    : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+      _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+      _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+      _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
+      _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
+       _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+       _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+       _reorder_negatives_kernel(), _store_kernel() -- GPU-path kernels, see "#if 0" code */
+{ // Counters start at 0, tensor and buffer pointers at nullptr, buffers default-constructed.
+}
+
+// Stores the tensors and radix-sort parameters. Only the CPU path is compiled, so no kernel or
+// device buffer is created here; the GPU set-up below stays disabled behind "#if 0".
+void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+                         int total_bits, int bits)
+{
+  _total_bits = total_bits;
+  _bits = bits;
+  _n = input->info()->tensor_shape()[0];
+  // bits is a divisor and a shift count: it must be in [1, 30] and divide _total_bits.
+  ARM_COMPUTE_ERROR_ON(bits <= 0 || bits >= 31);
+  ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
+
+  _k = k;
+  _radix = 1 << bits;
+  _input = input;
+  _values = values;
+  _indices = indices;
+
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+//      Invalid result on GPU
+#if 0
+  std::string topk_env;
+  char *env = getenv("ACL_TOPKV2");
+  if (env)
+    topk_env = env;
+
+  if (topk_env == "GPU_SINGLE")
+  {
+    _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
+                             CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+    _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
+                              CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+    _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
+  }
+  else if (topk_env == "GPU")
+  {
+    // n should be divided by (_GROUPS * _ITEMS)
+    ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
+
+    _hist_buf_size = _radix * _GROUPS * _ITEMS;
+    _glob_sum_buf_size = _HISTOSPLIT;
+
+    _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                           sizeof(cl_int) * _hist_buf_size);
+    _glob_sum_buf =
+        cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                   sizeof(cl_int) * _glob_sum_buf_size);
+    _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                           sizeof(cl_int) * _glob_sum_buf_size);
+    _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
+                                         CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
+    _in_key_buf = cl::Buffer(CLScheduler::get().context(),
+                             CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+    _out_key_buf = cl::Buffer(CLScheduler::get().context(),
+                              CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+    _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
+                             CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+    _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
+                              CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+    _p_in_key_buf = &_in_key_buf;
+    _p_out_key_buf = &_out_key_buf;
+    _p_in_ind_buf = &_in_ind_buf;
+    _p_out_ind_buf = &_out_ind_buf;
+
+    _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
+    _hist_kernel.configure(&_hist_buf, bits, _n);
+    _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+    _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
+    _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+    _reorder_kernel.configure(&_hist_buf, bits, _n);
+    _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
+    _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
+    _store_kernel.configure(values, indices, k, _n);
+  }
+  else
+#endif // Disable GPU implementation
+  {
+    // DO NOTHING for CPU.
+  }
+}
+
+void CLTopKV2::run()
+{ // Only run_on_cpu() is compiled; the ACL_TOPKV2 dispatch below is disabled.
+#if 0
+  std::string topk_env;
+  char *env = getenv("ACL_TOPKV2");
+  if (env)
+    topk_env = env;
+
+  if (topk_env == "GPU_SINGLE")
+  {
+    run_on_gpu_single_quicksort();
+  }
+  else if (topk_env == "GPU")
+  {
+    run_on_gpu();
+  }
+  else
+#endif
+  {
+    run_on_cpu();
+  }
+}
+
+#if 0
+void CLTopKV2::run_on_gpu_single_quicksort()
+{
+  // Single-threaded quick sort kernel, enqueued with flush == false.
+  CLScheduler::get().enqueue(_qs_kernel, false);
+  // Wait for the queue before returning (this function is compiled out by "#if 0").
+  arm_compute::CLScheduler::get().sync();
+}
+
+void CLTopKV2::run_on_gpu()
+{
+  cl::CommandQueue q = CLScheduler::get().queue();
+  // NOTE q is only referenced by the debug dump inside the nested "#if 0" at the end.
+  // 1. CLTopKV2Init set key buffer and index buffer.
+  //  - Key buffer is set as the same value of the layer's input
+  //  - Values in the index buffer are set as their indices.
+  CLScheduler::get().enqueue(_init_kernel, false);
+
+  int n_passes = _total_bits / _bits;
+
+  // 2. Repeat (total_bits/bits) times.
+  //   - total_bits is the number of bits of the data type (e.g., 32 for float)
+  //   - bits defines number of buckets (e.g. 16 buckets where bit is 4)
+  for (int pass = 0; pass < n_passes; ++pass)
+  {
+    arm_compute::CLScheduler::get().sync();
+
+    // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
+    _hist_kernel.setPass(pass, _p_in_key_buf);
+    CLScheduler::get().enqueue(_hist_kernel, false);
+
+    // 2.2. Calculate prefix sum locally with multiple threads
+    CLScheduler::get().enqueue(_scan_hist_kernel, false);
+    // 2.3. Calculate prefix sum within a work group
+    CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
+    // 2.4. Calculate global prefix sum
+    CLScheduler::get().enqueue(_paste_hist_kernel, false);
+
+    // 2.5. Reorder keys and indices based on the global prefix sum
+    _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
+    CLScheduler::get().enqueue(_reorder_kernel, false);
+
+    cl::Buffer *tmp;
+    // swap key buffers
+    tmp = _p_in_key_buf;
+    _p_in_key_buf = _p_out_key_buf;
+    _p_out_key_buf = tmp;
+
+    // swap index buffers
+    tmp = _p_in_ind_buf;
+    _p_in_ind_buf = _p_out_ind_buf;
+    _p_out_ind_buf = tmp;
+  }
+
+  // 3. Get the first negative index
+  // Because the in/out pointers are swapped at the end of every pass above,
+  // the result of the last pass is reachable through the _p_in_* pointers.
+  _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
+  CLScheduler::get().enqueue(_find_first_negative_kernel, false);
+
+  // 4. Correct ordering of negatives
+  //   - Since radix sort does not consider negatives, negatives are considered as bigger values
+  //   than positives.
+  // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf
+  _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
+                                       _p_out_ind_buf);
+  CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
+
+  // 5. Extract top k values from sorted keys and indices.
+  _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
+  CLScheduler::get().enqueue(_store_kernel, false);
+
+  arm_compute::CLScheduler::get().sync();
+
+#if 0
+  // The code below is kept for debugging: it reads every work buffer back and prints it.
+  int first_neg;
+  q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
+  std::cout << "first neg = " << first_neg << std::endl;
+
+  float in_key[_n];
+  q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
+  for(uint32_t i = 0 ; i < _n; ++i) {
+    std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
+  }
+
+  float out_key[_n];
+  q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
+  for(uint32_t i = 0 ; i < _n; ++i) {
+    std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
+  }
+
+  int in_ind[_n];
+  q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
+  for(uint32_t i = 0 ; i < _n; ++i) {
+    std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
+  }
+
+  int out_ind[_n];
+  q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
+  for(uint32_t i = 0 ; i < _n; ++i) {
+    std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
+  }
+
+  int hist_buf[_hist_buf_size];
+  q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
+  for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
+    std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
+  }
+
+  int glob_sum_buf[_glob_sum_buf_size];
+  q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
+  for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
+    std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
+  }
+
+#endif
+}
+#endif // Disable GPU implementation
+
+// Host implementation of top-k: validates rank and element type, maps the three tensors,
+// runs the TopK routine for the element type, then unmaps them again.
+void CLTopKV2::run_on_cpu()
+{
+  // Validate before mapping so that a rejected input never leaves a tensor mapped.
+  const int rank = _input->info()->num_dimensions();
+  if (rank > 2)
+    throw std::runtime_error("Not supported rank.");
+
+  const DataType data_type = _input->info()->data_type();
+  if (data_type != DataType::F32 && data_type != DataType::S32 && data_type != DataType::QASYMM8)
+    throw std::runtime_error("Not supported type.");
+
+  // Dimension 0 is the row length; a rank-2 input contributes tensor_shape()[1] rows.
+  const int row_size = _input->info()->tensor_shape()[0];
+  const int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
+
+  cl::CommandQueue q = CLScheduler::get().queue();
+  _input->map(q);
+  _values->map(q);
+  _indices->map(q);
+
+  if (data_type == DataType::F32)
+  {
+    nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
+                                         (int32 *)_indices->buffer(), (float *)_values->buffer());
+  }
+  else if (data_type == DataType::S32)
+  {
+    nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
+                                           (int32 *)_indices->buffer(),
+                                           (int32_t *)_values->buffer());
+  }
+  else // DataType::QASYMM8, the only type left after the check above
+  {
+    nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
+                                           (int32 *)_indices->buffer(),
+                                           (uint8_t *)_values->buffer());
+  }
+
+  _input->unmap(q);
+  _values->unmap(q);
+  _indices->unmap(q);
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
new file mode 100644
index 000000000..40e21671d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _scale_f(),
+      _conv_f(),
+      _flip_weights(),
+      _scaled_output(),
+      _original_weights(nullptr),
+      _weights_flipped(),
+      _is_prepared(false)
+{
+}
+
+// Checks that a transposed convolution of input with weights (and optional bias) can produce
+// output: square non-empty kernel, invalid_right/invalid_bottom below the kernel size, matching
+// types, layouts and output extents; then validates the upsample and convolution stages.
+Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                      const ITensorInfo *bias, ITensorInfo *output,
+                                      const PadStrideInfo &info, unsigned int invalid_right,
+                                      unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+  const DataLayout data_layout = input->data_layout();
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+  const unsigned int kernel_x = weights->dimension(idx_w);
+  const unsigned int kernel_y = weights->dimension(idx_h);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
+                                  "invalid_right must be smaller than kernel_x");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
+                                  "invalid_bottom must be smaller than kernel_y");
+
+  // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added.
+  auto out_dims = transposeconv_output_dimensions(
+      input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+      weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+  const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+  if (bias != nullptr)
+  {
+    if (is_data_type_quantized_asymmetric(input->data_type()))
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+    }
+    else
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+                                  "Output's width is invalid.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+                                  "Output's height is invalid.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+                                  "Output's depth is invalid.");
+
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+      pad_bottom);
+  TensorInfo scale_out_info(input->clone()
+                                ->set_is_resizable(true)
+                                .reset_padding()
+                                .set_tensor_shape(scale_out_shape)
+                                .set_data_layout(data_layout));
+  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+                                                           conv_info, weights_info));
+
+  return Status{};
+}
+
+// Sets up the transposed convolution as: weight flip (executed later, in prepare()), upsample
+// of input into the managed _scaled_output tensor, then a stride-1 convolution into output.
+// Validation receives the same weights_info that the convolution is configured with.
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+                                     ICLTensor *output, const PadStrideInfo &info,
+                                     unsigned int invalid_right, unsigned int invalid_bottom,
+                                     const WeightsInfo &weights_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  const unsigned int stride_x = info.stride().first;
+  const unsigned int stride_y = info.stride().second;
+
+  const DataLayout data_layout = input->info()->data_layout();
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+  _original_weights = weights;
+  _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+  _flip_weights.configure(weights, &_weights_flipped);
+
+  // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were
+  // added.
+  auto out_dims = transposeconv_output_dimensions(
+      input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+      weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+      invalid_bottom);
+  const TensorShape output_shape =
+      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+  // Output auto initialization if not yet initialized
+  auto_init_if_empty(
+      *output->info(),
+      input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+  // Perform validation step
+  ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
+      input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+      info, invalid_right, invalid_bottom, weights_info));
+
+  _is_prepared = weights_info.retain_internal_weights();
+  _memory_group.manage(&_scaled_output);
+
+  // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+  // to match output shape
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+      pad_right, pad_top, pad_bottom);
+
+  TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+                            input->info()->quantization_info());
+  scale_out_info.set_data_layout(data_layout);
+  _scaled_output.allocator()->init(scale_out_info);
+
+  // configure scale function
+  const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+                                    DimensionRoundingType::FLOOR);
+  _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
+
+  // setup the function to convolve the upscaled output
+  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+  _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
+  _scaled_output.allocator()->allocate();
+}
+
+// Runs the upsample stage and then the convolution stage. The memory group is held through an
+// RAII scope so it is released on every exit path, including an exception from a stage.
+void CLTransposeConvLayer::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  _scale_f.run();
+  _conv_f.run();
+}
+
+// One-time preparation: flips the weights on the host into _weights_flipped, marks the
+// original weights as unused, prepares the convolution and frees the flipped copy if unused.
+void CLTransposeConvLayer::prepare()
+{
+  if (_is_prepared)
+  {
+    return;
+  }
+  ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+  // Run weights flipping and mark original weights tensor as unused
+  _weights_flipped.allocator()->allocate();
+  _weights_flipped.map(true);
+  _original_weights->map(CLScheduler::get().queue(), true);
+  CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
+  _weights_flipped.unmap();
+  _original_weights->unmap(CLScheduler::get().queue());
+  _original_weights->mark_as_unused();
+  // Prepare convolution
+  _conv_f.prepare();
+  if (!_weights_flipped.is_used())
+  {
+    _weights_flipped.allocator()->free();
+  }
+  _is_prepared = true;
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
new file mode 100644
index 000000000..0ce3e6700
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
+    : _upsample(),
+      _output(nullptr)
+{
+}
+
+Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                              const BorderSize &inner_border,
+                                              const PadStrideInfo &info)
+{ // Pure forwarder: the kernel's static validate() performs every check.
+  return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
+}
+
+void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
+                                             const BorderSize &inner_border,
+                                             const PadStrideInfo &info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  // Remember the output tensor: run() maps and clears it before enqueuing the kernel.
+  _output = output;
+  _upsample.configure(input, output, inner_border, info);
+}
+
+// Clears the mapped output on the host, then enqueues the upsample kernel with flush == false.
+void CLTransposeConvLayerUpsample::run()
+{
+  _output->map(CLScheduler::get().queue(), true);
+  if (!is_data_type_quantized_asymmetric(_output->info()->data_type()))
+  {
+    memset(_output->buffer(), 0, _output->info()->total_size());
+  }
+  else
+  {
+    const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+    std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+  }
+  _output->unmap(CLScheduler::get().queue());
+  CLScheduler::get().enqueue(_upsample, false);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
new file mode 100644
index 000000000..f8e0ef8a6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info)
+{
+  auto upsample_kernel = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>();
+  upsample_kernel->configure(input, output, info);
+  _kernel = std::move(upsample_kernel); // the function object now owns the configured kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
new file mode 100644
index 000000000..80fbf359d
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/NEFunctionsEx.h"
+
+// NOTE This empty file aims to validate "NEFunctionsEx.h".
+//      DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
new file mode 100644
index 000000000..5ba465b61
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+
+template <ReductionOperation OP>
+NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape()
+{ // The memory manager is moved into the memory group; all real setup happens in configure().
+}
+
+// Checks that `input` can be arg-reduced along `axis` into `output`.
+// `axis` may be negative (it is wrapped by the input rank); the wrapped axis must be < rank and
+// <= 3. `output` must have the input's shape with the reduced dimension removed.
+template <ReductionOperation OP>
+Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis,
+                                       const ITensorInfo *output)
+{
+  // `output` is dereferenced below, so it has to be rejected here together with `input`.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+
+  const int input_dims = input->num_dimensions();
+  const int axis_local = wrap_around(axis, input_dims); // convert negative axis
+  ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3);
+  ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1);
+
+  TensorShape out_shape = input->tensor_shape();
+  out_shape.remove_dimension(axis_local);
+  const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+  return Status{};
+}
+
+// Sets up the two stages executed by run(): the arg-reduction of `input` along `axis` into an
+// intermediate tensor that keeps the reduced axis with extent 1, and a reshape of that
+// intermediate into `output`, whose shape has the axis removed.
+template <ReductionOperation OP>
+void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor *output)
+{
+  // `output` is dereferenced below (it types the intermediate), so check it with `input`.
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  // Convert negative axis
+  const int input_dims = input->info()->num_dimensions();
+  const int axis_local = wrap_around(axis, input_dims);
+
+  // Perform reduction for axis; the intermediate keeps the axis with extent 1
+  TensorShape intermediate_shape = input->info()->tensor_shape();
+  intermediate_shape.set(axis_local, 1);
+  _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(),
+                                            output->info()->data_type(),
+                                            output->info()->quantization_info()));
+  _memory_group.manage(&_reduced_out);
+  _reduction_kernel.configure(input, axis_local, &_reduced_out, OP);
+
+  // Allocate intermediate tensor
+  _reduced_out.allocator()->allocate();
+
+  // The reshape drops the reduced dimension; an empty output info is initialised to that shape
+  TensorShape out_shape = input->info()->tensor_shape();
+  out_shape.remove_dimension(axis_local);
+  auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape));
+  _reshape.configure(&_reduced_out, output);
+}
+
+template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run()
+{
+  // Scope object for the memory group; it lives until both stages below have run.
+  MemoryGroupResourceScope memory_scope(_memory_group);
+  _reduction_kernel.run();
+  _reshape.run();
+}
+
+// Explicit instantiations for the supported reduction operations (ARG_IDX_MAX, ARG_IDX_MIN)
+template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>;
+template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>;
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
new file mode 100644
index 000000000..7c15fc453
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
+#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+
+template <BinaryLogicalOperation COP>
+void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
+                                                    ITensor *output)
+{
+  auto logic_kernel = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+  logic_kernel->configure(COP, input1, input2, output); // COP is fixed at compile time
+  _kernel = std::move(logic_kernel);
+}
+
+template <BinaryLogicalOperation COP>
+Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
+                                                     const ITensorInfo *input2,
+                                                     const ITensorInfo *output)
+{ // Forwards to the kernel's validate() with the compile-time operation COP.
+  return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
+                                         BinaryLogicalOperation op)
+{
+  auto logic_kernel = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+  logic_kernel->configure(op, input1, input2, output); // op is chosen by the caller at run time
+  _kernel = std::move(logic_kernel);
+}
+
+Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+                                          const ITensorInfo *output, BinaryLogicalOperation op)
+{ // Forwards to the kernel's validate() with the caller-supplied operation `op`.
+  return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output);
+}
+
+// Explicit instantiations for the supported compile-time operations (AND, OR)
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
new file mode 100644
index 000000000..f2490e4e8
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NECast.h"
+
+#include "arm_compute/core/NEON/kernels/NECastKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
+{
+  auto cast_kernel = arm_compute::support::cpp14::make_unique<NECastKernel>();
+  cast_kernel->configure(input, output, input_subtype);
+  _kernel = std::move(cast_kernel); // the function object now owns the configured kernel
+}
+
+Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output,
+                        SubDataType input_subtype)
+{ // Adds no checks of its own: the result is exactly what NECastKernel::validate() reports.
+  return NECastKernel::validate(input, output, input_subtype);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
new file mode 100644
index 000000000..db419e3a8
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+{
+  auto d2s_kernel = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>();
+  d2s_kernel->configure(input, output, block_shape);
+  _kernel = std::move(d2s_kernel); // the function object now owns the configured kernel
+}
+
+Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                       int32_t block_shape)
+{ // Adds no checks of its own: the result is exactly what the kernel's validate() reports.
+  return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp
new file mode 100644
index 000000000..a95018a28
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NENegLayer::configure(const ITensor *input, ITensor *output)
+{
+  auto neg_kernel = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>();
+  neg_kernel->configure(ElementWiseUnaryEx::NEG, input, output); // operation is fixed to NEG
+  _kernel = std::move(neg_kernel);
+}
+Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{ // Forwards to the kernel's validate() with the operation fixed to NEG.
+  return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
new file mode 100644
index 000000000..00c3ed94f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
+{
+  auto lookup_kernel = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+  lookup_kernel->configure(input, output, lookups);
+  _kernel = std::move(lookup_kernel); // the function object now owns the configured kernel
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
new file mode 100644
index 000000000..d604fedbf
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Validates the GEMMLowp stage of the hybrid layer for the given tensor infos.
+// nullptr fills the third argument of NEGEMMLowpMatrixMultiplyCoreEx::validate(), matching the
+// nullptr that configure_mm() passes in the same position.
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+  return NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output);
+}
+} // namespace
+
+void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
+{
+  auto transpose_kernel = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
+  transpose_kernel->configure(input, output); // "reshaping" the weights is a plain transpose
+  _kernel = std::move(transpose_kernel);
+}
+
+Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
+                                                           const ITensorInfo *output)
+{ // Same kernel as configure(): the check is NETransposeKernel's own validate().
+  return NETransposeKernel::validate(input, output);
+}
+
+NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
+      _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
+      _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
+      _accumulate_biases(false), _is_prepared(false)
+{ // No weights are bound yet and all state flags start false; configure() does the real setup.
+}
+
+void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights,
+                                               ITensor *output)
+{
+  // Input dimension 0 has to equal weights dimension 1 for the GEMM configured below.
+  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+  // Configure gemmlowp function; nullptr is passed for its third argument.
+  _mm_gemmlowp.configure(input, weights, nullptr, output);
+}
+
+// Wires up the stages executed by run(): `input` is quantized to S8 (producing an F32
+// scale-factor tensor), multiplied with `weights` by the GEMMLowp function into an S32 tensor
+// and rescaled into `output`; `biases`, when given, are accumulated onto `output` last.
+// When the weights still need the transpose, only its configuration happens here; the
+// transpose itself is run once by prepare().
+void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights,
+                                            const ITensor *biases, ITensor *output,
+                                            FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  // Perform validate step (`biases` is optional and forwarded as nullptr when absent)
+  ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
+      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+      fc_info));
+
+  // Without a transpose request the weights are treated as already reshaped.
+  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+
+  // Kept for the is_used() check in prepare().
+  _original_weights = weights;
+
+  // Bias accumulation is enabled only when a bias tensor is supplied.
+  _accumulate_biases = (biases != nullptr);
+  if (_accumulate_biases)
+  {
+    _accumulate_biases_kernel.configure(output, biases);
+  }
+
+  // An input coming from a convolution layer is not supported. It is detected as follows:
+  //  - batched output (dimension 1 > 1): the input dimensions from index 3 onwards equal the
+  //    output dimensions from index 1 onwards;
+  //  - otherwise: the input has more than one dimension and its dimension 1 exceeds 1.
+  const bool has_batches = output->info()->dimension(1) > 1;
+  const bool fed_by_conv =
+      has_batches ? ((TensorShape::num_max_dimensions >= 4) &&
+                     std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                input->info()->tensor_shape().cend(),
+                                output->info()->tensor_shape().cbegin() + 1))
+                  : (input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1);
+  ARM_COMPUTE_ERROR_ON_MSG(fed_by_conv,
+                           "NEFullyConnectedHybridLayer does not support after conv");
+  (void)fed_by_conv; // keeps the flag referenced regardless of what the macro expands to
+
+  // Reshape weights if needed; the GEMM then reads the transposed copy.
+  const ITensor *weights_to_use = weights;
+  if (!_are_weights_reshaped)
+  {
+    _reshape_weights_output.allocator()->init(
+        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+            compute_transposed_shape(*weights->info())));
+    _reshape_weights_function.configure(weights, &_reshape_weights_output);
+    weights_to_use = &_reshape_weights_output;
+  }
+
+  // Quantize input: S8 copy of the input plus an F32 tensor with output dimension(1) elements
+  _quantized_input.allocator()->init(
+      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+  _scale_factor.allocator()->init(
+      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
+  _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
+
+  // GEMM: quantized input and weights go into an S32 tensor shaped like the output
+  _gemmlowp_output.allocator()->init(
+      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+  configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);
+
+  // Multiply scale: S32 result, scale factors and the weights' quantization scale -> output
+  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
+                                   weights->info()->quantization_info().scale);
+
+  // Retained internal weights also suppress the transpose in prepare().
+  if (fc_info.retain_internal_weights)
+  {
+    _are_weights_reshaped = true;
+  }
+
+  // Allocate the intermediates now. The transposed weights are not allocated here:
+  // prepare() does that right before running the transpose.
+  _quantized_input.allocator()->allocate();
+  _scale_factor.allocator()->allocate();
+  _gemmlowp_output.allocator()->allocate();
+}
+
+// Static counterpart of configure(): checks data types and ranks, then validates every stage
+// (optional weight transpose, input quantization, GEMMLowp, scale multiplication, optional bias
+// accumulation) against tensor infos built the same way configure() builds its tensors.
+Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                             const ITensorInfo *biases, const ITensorInfo *output,
+                                             FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  // Float input and output, S8 weights; weights and output have at most 2 dimensions.
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+
+  // Without a transpose request the weights are treated as already reshaped.
+  const bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+
+  // Info of the transposed weights; only consulted when the transpose is still pending.
+  const TensorInfo reshaped_weights(
+      weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+
+  // Biases are optional; when present they must share the input's data type.
+  if (biases != nullptr)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+  }
+
+  const ITensorInfo *weights_to_use = weights;
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel, then continue with the transposed info
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  // Same requirement as configure_mm(): input dimension 0 must equal weights dimension 1.
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+
+  // Quantization stage: S8 copy of the input plus an F32 tensor of output dimension(1) elements
+  const TensorInfo quantized_input(
+      input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+  const TensorInfo scale_factor(TensorShape{output->dimension(1)}, 1, DataType::F32);
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
+
+  // Matrix multiply stage: the GEMMLowp result is an S32 tensor shaped like the output.
+  const TensorInfo gemmlowp_output(
+      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
+
+  // Scale stage: S32 result, scale factors and the weights' quantization scale -> output
+  ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
+      &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale));
+
+  return Status{};
+}
+
+// Executes the configured stages in order; the one-time preparation is triggered from here.
+void NEFullyConnectedHybridLayer::run()
+{
+  prepare(); // does nothing once _is_prepared is set
+
+  MemoryGroupResourceScope memory_scope(_memory_group);
+  auto &scheduler = NEScheduler::get();
+
+  // 1) Quantize input
+  scheduler.schedule(&_quant_input_kernel, Window::DimY);
+  // 2) Run matrix multiply
+  _mm_gemmlowp.run();
+  // 3) Multiply scale factor
+  scheduler.schedule(&_multiply_scale_kernel, Window::DimY);
+
+  // 4) Accumulate biases if provided
+  if (_accumulate_biases)
+  {
+    scheduler.schedule(&_accumulate_biases_kernel, Window::DimY);
+  }
+}
+
+// One-time setup before the first run: transposes the weights if that is still pending, prepares
+// the GEMMLowp function and frees the transposed-weights tensor when it is no longer used.
+void NEFullyConnectedHybridLayer::prepare()
+{
+  if (_is_prepared)
+  {
+    return; // everything below runs only once
+  }
+
+  // The original weights must still be marked as used at this point.
+  ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+  // Reshape of the weights (happens only once)
+  if (!_are_weights_reshaped)
+  {
+    // The transposed copy is allocated here, right before the transpose fills it.
+    _reshape_weights_output.allocator()->allocate();
+    _reshape_weights_function.run();
+    _are_weights_reshaped = true;
+    // We can not release _original_weights because it can be used in other nodes
+  }
+
+  // Let the GEMM function do its own preparation
+  _mm_gemmlowp.prepare();
+
+  // Release reshaped weights if unused
+  if (!_reshape_weights_output.is_used())
+  {
+    _reshape_weights_output.allocator()->free();
+  }
+
+  _is_prepared = true;
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..a944f699a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace // File-local helpers
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{ // Validates the matrix-multiply stage: GEMMLowp for asymmetric-quantized input, else NEGEMM
+  if (is_data_type_quantized_asymmetric(input.data_type()))
+  {
+    // Since we need negative offsets for computing convolution, build QuantizationInfo objects
+    // that keep each scale but negate the offset of input and weights.
+    // They are applied to clones below, so the caller's tensor infos are not modified.
+    const QuantizationInfo input_quantization_info(input.quantization_info().scale,
+                                                   -input.quantization_info().offset);
+    const QuantizationInfo weights_quantization_info(weights.quantization_info().scale,
+                                                     -weights.quantization_info().offset);
+
+    // Validate gemmlowp function against the clones carrying the negated offsets
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
+        &input.clone()->set_quantization_info(input_quantization_info),
+        &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
+        &input, &weights, nullptr, &output, 1.f, 0.0f,
+        GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
+  }
+
+  return Status{}; // Reached only when the selected validate call reported no error
+}
+} // namespace
+
+NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
+      _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
+      _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
+      _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
+      _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
+      _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+{ // memory_manager is moved into _memory_group; most flags are overwritten later by configure()
+}
+
+void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights,
+                                           ITensor *output)
+{ // Configures _mm_gemmlowp when _is_quantized is set, otherwise _mm_gemm
+  if (_is_quantized)
+  {
+    // Since we need negative offsets for computing convolution, we need to change
+    // QuantizationInfo(). Save the current input and weights quantization info first so the
+    // offsets can be negated for the configure call and restored right after it.
+    const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+    const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+    input->info()->set_quantization_info(
+        QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+    weights->info()->set_quantization_info(
+        QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+    // Configure gemmlowp function
+    _mm_gemmlowp.configure(input, weights, nullptr, output);
+
+    // Restore the original QuantizationInfo, as input and weights could be used in other fully
+    // connected layers
+    input->info()->set_quantization_info(input_quantization_info);
+    weights->info()->set_quantization_info(weights_quantization_info);
+  }
+  else
+  {
+    // Configure matrix multiply kernel
+    _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f,
+                       GEMMInfo(false, false, false /* Reshape weights only for the first run */));
+  }
+}
+
+void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights,
+                                                ITensor *output)
+{ // Flattens input into _flatten_output, then configures the matrix multiply on the flat tensor
+  ARM_COMPUTE_ERROR_ON(
+      (weights->info()->dimension(1) !=
+       (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+  // If the fully connected layer is called after a convolution layer, the input tensor must be
+  // linearized
+
+  // Initialize output tensor for flatten from a clone of the input info with the flattened shape
+  TensorShape shape_flatten = compute_flatten_shape(input->info());
+  _flatten_output.allocator()->init(
+      input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          shape_flatten));
+
+  // Register the flatten output with the memory group, then configure the flatten kernel
+  _memory_group.manage(&_flatten_output);
+  _flatten_kernel.configure(input, &_flatten_output);
+
+  // Configure matrix multiply kernel with the flattened tensor as its input
+  configure_mm(&_flatten_output, weights, output);
+
+  // Allocate the output tensor for flatten once all the configure methods have been called
+  _flatten_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights,
+                                              ITensor *output)
+{ // No flatten step here: the input is passed to the matrix multiply as it is
+  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+  // Configure matrix multiply kernel
+  configure_mm(input, weights, output);
+}
+
+void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights,
+                                        const ITensor *biases, ITensor *output,
+                                        FullyConnectedLayerInfo fc_info)
+{ // Validates the arguments, then wires up reshape/convert/flatten/GEMM/output-stage as needed
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  // Perform validate step (biases may be null, in which case no bias info is passed)
+  ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
+      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+      fc_info));
+
+  _are_weights_converted = true;
+  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  _is_fc_after_conv = true; // Provisional value; recomputed below from the tensor shapes
+  _accumulate_biases = false;
+  _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+  _original_weights = weights;
+
+  // Configure gemmlowp output: an S32 intermediate initialized from a clone of the output info
+  if (_is_quantized)
+  {
+    _gemmlowp_output.allocator()->init(
+        output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+            DataType::S32));
+  }
+
+  // Configure accumulate biases kernel for non quantized asymmetric types
+  if (biases != nullptr && !_is_quantized)
+  {
+    _accumulate_biases = true;
+
+    // Configure accumulate biases kernel
+    _accumulate_biases_kernel.configure(output, biases);
+  }
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensor *weights_to_use = weights;
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+  if (is_batched_fc_layer)
+  {
+    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                    input->info()->tensor_shape().cend(),
+                                    output->info()->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    _is_fc_after_conv = input->info()->num_dimensions() > 1;
+  }
+
+  // Reshape weights if needed
+  if (!_are_weights_reshaped)
+  {
+    // Reshape the weights into _reshape_weights_output and use that tensor from here on
+    _reshape_weights_function.configure(weights, &_reshape_weights_output);
+    weights_to_use = &_reshape_weights_output;
+  }
+
+  // Convert weights if needed (input data layout differs from the weights' trained layout)
+  if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Convert weights
+    _convert_weights.configure(weights_to_use, &_converted_weights_output,
+                               input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+    weights_to_use = &_converted_weights_output;
+    _are_weights_converted = false;
+  }
+
+  ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+  if (_is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer (with or without batches)
+    configure_conv_fc(input, weights_to_use, tmp_output);
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer (with or without batches)
+    configure_fc_fc(input, weights_to_use, tmp_output);
+  }
+
+  // Configure output stage for asymmetric quantized types
+  if (_is_quantized)
+  {
+    float multiplier = input->info()->quantization_info().scale *
+                       weights->info()->quantization_info().scale /
+                       output->info()->quantization_info().scale;
+    int output_multiplier;
+    int output_shift;
+    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
+                                                               &output_shift); // NOTE(review): result of this call is not checked - confirm
+    _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
+                                     output_shift, output->info()->quantization_info().offset);
+    _gemmlowp_output.allocator()->allocate();
+  }
+
+  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; // Skips the reshape in run() when internal weights are retained
+}
+
+Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                         const ITensorInfo *biases, const ITensorInfo *output,
+                                         FullyConnectedLayerInfo fc_info)
+{ // Static check mirroring the decisions of configure(); returns the first error encountered
+  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  bool is_fc_after_conv = true; // Provisional value; recomputed below from the tensor shapes
+  bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+  const ITensorInfo &flatten_input =
+      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_flatten_shape(input)));
+  const ITensorInfo &reshaped_weights =
+      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+  const ITensorInfo &converted_weights =
+      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                       : TensorInfo(*reshaped_weights.clone());
+  const ITensorInfo &gemmlowp_output = TensorInfo(
+      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+  // Validate accumulate biases kernel for non quantized asymmetric types
+  if (biases != nullptr && !is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+  }
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensorInfo *input_to_use = input;
+  const ITensorInfo *weights_to_use = weights;
+  const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+  if (is_batched_fc_layer)
+  {
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->num_dimensions() > 1;
+  }
+
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Validate convert weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+    weights_to_use = &converted_weights;
+  }
+
+  if (is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer (with or without batches)
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        (weights_to_use->dimension(1) !=
+         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+    // Validate flatten kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+    input_to_use = &flatten_input;
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer (with or without batches)
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+  }
+  // Validate matrix multiply kernel
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+  // Validate output stage for asymmetric quantized types
+  if (is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+        &gemmlowp_output, biases, output));
+  }
+
+  return Status{};
+}
+
+void NEFullyConnectedLayerEx::run() // Weight reshape/convert are executed on every call here
+{
+  if (!_is_prepared) // First call only: allocate the intermediate weight tensors
+  {
+    if (!_are_weights_reshaped)
+      _reshape_weights_output.allocator()->allocate();
+    if (!_are_weights_converted)
+      _converted_weights_output.allocator()->allocate();
+    _is_prepared = true;
+  }
+
+  { // Not guarded by _is_prepared: this block executes on every run() call
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Reshape of the weights
+    if (!_are_weights_reshaped)
+    {
+      _reshape_weights_function.run();
+    }
+
+    // Convert weights if needed
+    if (!_are_weights_converted)
+    {
+      _convert_weights.run();
+    }
+
+    // Prepare the GEMM function (non-quantized path only)
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+  }
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Linearize input if it comes from a convolutional layer
+  if (_is_fc_after_conv)
+  {
+    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+  }
+
+  // Run matrix multiply
+  if (_is_quantized)
+  {
+    _mm_gemmlowp.run();
+  }
+  else
+  {
+    _mm_gemm.run();
+  }
+
+  // Quantized: run the gemmlowp output stage; otherwise accumulate biases if provided
+  if (_is_quantized)
+  {
+    _gemmlowp_output_stage.run();
+  }
+  else
+  {
+    if (_accumulate_biases)
+    {
+      NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+    }
+  }
+}
+
+void NEFullyConnectedLayerEx::prepare() // Currently a no-op: the whole body is disabled below
+{
+#if 0 // TODO Remove this block
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    auto release_unused = [](Tensor *w) {
+      if (!w->is_used())
+      {
+        w->allocator()->free();
+      }
+    };
+
+    // Pointer to current weights
+    const ITensor *cur_weights = _original_weights;
+
+    // Reshape of the weights (happens only once)
+    if (!_are_weights_reshaped)
+    {
+      // Run reshape weights kernel and mark weights as unused
+      _reshape_weights_output.allocator()->allocate();
+      _reshape_weights_function.run();
+
+      cur_weights->mark_as_unused();
+      cur_weights = &_reshape_weights_output;
+      _are_weights_reshaped = true;
+    }
+
+    // Convert weights if needed (happens only once)
+    if (!_are_weights_converted)
+    {
+      _converted_weights_output.allocator()->allocate();
+      _convert_weights.run();
+
+      cur_weights->mark_as_unused();
+      _are_weights_converted = true;
+    }
+
+    // Release reshaped weights if unused
+    release_unused(&_reshape_weights_output);
+
+    // Prepare the GEMM function (non-quantized path only)
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+
+    // Release reshaped weights (second attempt, after GEMM prepare) and converted weights if unused
+    release_unused(&_reshape_weights_output);
+    release_unused(&_converted_weights_output);
+
+    _is_prepared = true;
+  }
+#endif
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..fcac3c7ae
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
+                                               const arm_compute::ITensor *weights,
+                                               const arm_compute::ITensor *biases,
+                                               arm_compute::ITensor *output, bool needs_reshape,
+                                               const arm_compute::TensorShape &reshape,
+                                               KernelType kernel_type)
+{ // Stores the tensors, optionally reshapes the input, then builds the FC function for kernel_type
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  const ITensor *input_to_use = input;
+  if (_needs_reshape)
+  {
+    // Reshape the input into _neon_buffer and feed that buffer to the FC function instead
+    auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+    _neon_reshape.configure(_input, &_neon_buffer);
+    input_to_use = &_neon_buffer;
+  }
+
+  _neon_fc = [&]() { // Each function is owned by a unique_ptr before configure(), so no leak on throw
+    if (kernel_type == KernelType::GENERAL)
+    {
+      std::unique_ptr<NEFullyConnectedLayerEx> fc(new NEFullyConnectedLayerEx{_memory_manager});
+      fc->configure(input_to_use, _weights, _biases, _output);
+      return std::unique_ptr<arm_compute::IFunction>(fc.release());
+    }
+    else
+    {
+      assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+      bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+                       weights->info()->data_type() == DataType::S8;
+      if (is_hybrid) // F32 input with S8 weights selects the hybrid implementation
+      {
+        std::unique_ptr<NEFullyConnectedHybridLayer> fc(
+            new NEFullyConnectedHybridLayer{_memory_manager});
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc.release());
+      }
+      else
+      {
+        std::unique_ptr<NEFullyConnectedLayer> fc(new NEFullyConnectedLayer{_memory_manager});
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc.release());
+      }
+    }
+  }();
+
+  // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+  if (_needs_reshape)
+  {
+    _neon_buffer.allocator()->allocate();
+  }
+}
+
+void NEFullyConnectedReshapingLayer::run(void) // Optional input reshape, then the FC function
+{
+  if (_needs_reshape)
+    _neon_reshape.run(); // Only configured when needs_reshape was passed to configure()
+
+  _neon_fc->run(); // _neon_fc is assigned in configure()
+}
+
+void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); } // Forwards to the FC function built in configure()
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
new file mode 100644
index 000000000..11794a1ea
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
+      _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+      _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+      _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
+      _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+      _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
+      _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
+{ // The same memory_manager is given to both _memory_group and _asm_glue; all flags start false
+}
+
+void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
+                                               ITensor *output, const GEMMInfo &gemm_info)
+{ // Validates the arguments, then sets up reshape, reduction, multiply and offset kernels
+  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+  ARM_COMPUTE_UNUSED(c); // NOTE(review): c is referenced below (validate call and output stage)
+  ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
+      a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+
+  const ITensor *matrix_a = a;
+  const ITensor *matrix_b = b;
+  GEMMInfo info = gemm_info;
+
+  // Clear state (NOTE(review): _assembly_path and _fuse_output_stage are not reset here - confirm)
+  _mtx_a_reshape_kernel = nullptr;
+  _mtx_b_reshape_kernel = nullptr;
+
+  // Set internal variables from the tensor infos and the GEMM info
+  _a_offset = a->info()->quantization_info().offset;
+  _b_offset = b->info()->quantization_info().offset;
+  _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+  _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+  _is_prepared = false;
+  _fused_assembly_path = false;
+  _original_b = b;
+
+  const ITensor *a_to_use = a;
+
+  // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+  if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+  {
+    _fuse_output_stage = true;
+    _memory_group.manage(&_mm_result_s32);
+    TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+    _mm_result_s32.allocator()->init(info_mm_result_s32);
+  }
+
+#ifdef __aarch64__
+#if 0  // Can use after arm compute library v19.11
+  switch (a->info()->data_type())
+  {
+    case DataType::QASYMM8:
+    case DataType::QASYMM8_SIGNED:
+    case DataType::U8:
+    case DataType::S8:
+    {
+      if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
+          info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+      {
+        _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+        _fused_assembly_path = _asm_glue.is_configured();
+      }
+      else
+      {
+        _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
+                            gemm_info);
+      }
+      _assembly_path = _asm_glue.is_configured();
+      break;
+    }
+    default:
+    {
+      ARM_COMPUTE_ERROR("Datatype not supported");
+      break;
+    }
+  }
+#endif // 0
+  ARM_COMPUTE_ERROR("aarch64 not supported"); // Always raised on aarch64 while the block above is disabled
+#endif /* __aarch64__ */
+  if (!(_assembly_path || _run_vector_matrix_multiplication))
+  {
+    matrix_a = &_tmp_a;
+    matrix_b = &_tmp_b;
+
+    // The interleaved output matrix will have the following shape:
+    // [ a_height * 4, ceil(a_width / 4.0f) ]
+    TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
+                      a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
+    // The transpose1xW output matrix will have the following shape:
+    // [ b_height * 16, ceil(b_width / 16.0f) ]
+    TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
+                      b->info()->quantization_info());
+    _tmp_a.allocator()->init(a_info);
+    _tmp_b.allocator()->init(b_info);
+    _memory_group.manage(&_tmp_a);
+    if (!_reshape_b_only_on_first_run) // _tmp_b is handed to the memory group only in this case
+    {
+      _memory_group.manage(&_tmp_b);
+    }
+
+    // Configure interleave kernel
+    {
+      auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+      k->configure(a_to_use, &_tmp_a);
+      _mtx_a_reshape_kernel = std::move(k);
+    }
+
+    // Configure transpose kernel
+    {
+      auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+      k->configure(b, &_tmp_b);
+      _mtx_b_reshape_kernel = std::move(k);
+    }
+  }
+
+  if (!_fused_assembly_path)
+  {
+    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0)
+    {
+      TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
+
+      _vector_sum_col.allocator()->init(info_vector_sum_col);
+      if (!_reshape_b_only_on_first_run)
+      {
+        _memory_group.manage(&_vector_sum_col);
+      }
+
+      // Configure Matrix B reduction kernel
+      _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
+    }
+
+    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (_b_offset != 0)
+    {
+      TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
+
+      _vector_sum_row.allocator()->init(info_vector_sum_row);
+      _memory_group.manage(&_vector_sum_row);
+
+      // Configure matrix A reduction kernel
+      _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
+                                        false);
+    }
+
+    if (_fuse_output_stage)
+    {
+      // Configure matrix multiply kernel; its result goes to the S32 intermediate _mm_result_s32
+      if (!_assembly_path)
+      {
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+        k->configure(matrix_a, matrix_b, &_mm_result_s32);
+        _mm_kernel = std::move(k);
+      }
+
+      _offset_contribution_output_stage_kernel.configure(
+          &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+          _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+          _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
+          _b_offset, info.gemmlowp_output_stage());
+    }
+    else
+    {
+      // Configure matrix multiply kernel; its result is written directly to output
+      if (!_assembly_path)
+      {
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+        k->configure(matrix_a, matrix_b, output);
+        _mm_kernel = std::move(k);
+      }
+      // Configure offset contribution kernel
+      _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                            _b_offset == 0 ? nullptr : &_vector_sum_row,
+                                            a_to_use->info()->dimension(0), _a_offset, _b_offset);
+    }
+  }
+
+  // Allocate tensors (same conditions as the manage()/init() calls above)
+  if (!_assembly_path && !_run_vector_matrix_multiplication)
+  {
+    _tmp_a.allocator()->allocate();
+    if (!_reshape_b_only_on_first_run)
+    {
+      _tmp_b.allocator()->allocate();
+    }
+  }
+
+  if (!_fused_assembly_path)
+  {
+    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+    {
+      _vector_sum_col.allocator()->allocate();
+    }
+
+    if (_b_offset != 0)
+    {
+      _vector_sum_row.allocator()->allocate();
+    }
+  }
+
+  if (_fuse_output_stage)
+  {
+    _mm_result_s32.allocator()->allocate();
+  }
+}
+
+// Static validation of a low-precision GEMM for the given tensor infos.
+//
+// Checks the argument types, decides between the assembly dispatch and the reference kernels,
+// and then validates every reshape, reduction, multiply and offset-contribution kernel that
+// the selected path uses. Returns an empty Status on success, otherwise the first error found.
+//
+// a, b      : infos of the two input matrices (single channel, S8)
+// c         : optional bias info; rejected when no output stage is requested
+// output    : info of the result (single channel, S32)
+// gemm_info : GEMM meta-data (output stage, 3D reinterpretation flags, B reshape policy)
+Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+                                                const ITensorInfo *c, const ITensorInfo *output,
+                                                const GEMMInfo &gemm_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+      "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                  "The product AB is defined only if the number of columns in A is "
+                                  "equal to the number of rows in B");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+                                  "Matrix A already reshaped is not supported");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+                                  "Matrix B already reshaped is not supported");
+
+  GEMMInfo info = gemm_info;
+  // Infos handed to the matrix multiply kernel; redirected to the reshaped temporaries below
+  // when the reference (non-assembly) matrix-matrix path is taken
+  const ITensorInfo *matrix_a_info = a;
+  const ITensorInfo *matrix_b_info = b;
+
+  const ITensorInfo *a_to_use = a;
+
+  TensorInfo tmp_a_info{};
+  TensorInfo tmp_b_info{};
+  TensorInfo mm_result_s32_info{};
+
+  int32_t a_offset = a->quantization_info().offset;
+  int32_t b_offset = b->quantization_info().offset;
+
+  // With a fused output stage the raw product goes to an S32 intermediate of the output's shape
+  bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+  if (fuse_output_stage)
+  {
+    auto_init_if_empty(
+        mm_result_s32_info,
+        a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+  }
+
+  // Check if we need to run the optimized assembly kernel
+  bool run_optimised = false;
+  bool run_optimised_requantized = false;
+  const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+  // NOTE(review): `a` was required to be S8 above and a_to_use aliases `a`, so this QASYMM8
+  // branch looks unreachable for inputs that passed the type checks - confirm
+  if (a_to_use->data_type() == DataType::QASYMM8 &&
+      info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+  {
+    run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f,
+                                                          reshape_b_only_on_first_run));
+    run_optimised_requantized = run_optimised;
+  }
+  else
+  {
+    run_optimised = bool(NEGEMMAssemblyDispatch::validate(
+        a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f,
+        reshape_b_only_on_first_run));
+  }
+
+  if (run_optimised)
+  {
+    // Assembly path: only the output dimensions have to agree with A and B
+    ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+    if (info.depth_output_gemm3d() != 0)
+    {
+      if (info.reinterpret_input_as_3d())
+      {
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+      }
+      else
+      {
+        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+      }
+    }
+    else
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+    }
+  }
+  else
+  {
+    // Reference path: 3D reinterpretation is rejected
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+                                    "NEGEMM cannot reinterpret the input tensor as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+                                    "NEGEMM cannot reinterpret the output tensor as 3D");
+
+    // A with fewer than two rows is treated as a vector and skips the reshape kernels
+    const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+    if (!run_vector_matrix_multiplication)
+    {
+      matrix_a_info = &tmp_a_info;
+      matrix_b_info = &tmp_b_info;
+
+      // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+      // 4.0f) ]
+      TensorShape shape_tmp_a = a->tensor_shape();
+      shape_tmp_a.set(0, a->dimension(0) * 4);
+      shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+      // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
+      // / 16.0f) ]
+      TensorShape shape_tmp_b = b->tensor_shape();
+      shape_tmp_b.set(0, b->dimension(1) * 16);
+      shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+      // Validate interleave and transpose kernels against the reshaped infos
+      auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+      auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+    }
+  }
+
+  // The reduction / offset-contribution kernels are only validated when the assembly path does
+  // not already cover requantization
+  if (!run_optimised_requantized)
+  {
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if (a_offset != 0)
+    {
+      info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+      // Validate matrix B reduction kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
+          b, &info_vector_sum_col, a->dimension(0), false));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (b_offset != 0)
+    {
+      info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+      // Validate matrix A reduction kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
+          a_to_use, &info_vector_sum_row, a->dimension(0), false));
+    }
+
+    if (fuse_output_stage)
+    {
+      if (!run_optimised)
+      {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
+            matrix_a_info, matrix_b_info, &mm_result_s32_info));
+      }
+
+      // Validate offset contribution kernel (with fused output stage)
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
+          &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+          b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
+          info.gemmlowp_output_stage()));
+    }
+    else
+    {
+      if (!run_optimised)
+      {
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+      }
+      // Validate offset contribution kernel
+      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
+          output, a_offset == 0 ? nullptr : &info_vector_sum_col,
+          b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
+    }
+  }
+  return Status{};
+}
+
+// Runs the configured GEMM: one-off preparation, input reshapes, the matrix multiply itself
+// and, unless the fused assembly path is active, the reductions and offset contribution.
+void NEGEMMLowpMatrixMultiplyCoreEx::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Reshape A on every run; B only when it is not reshaped once in prepare()
+  if (_mtx_a_reshape_kernel)
+  {
+    NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+  }
+  if (!_reshape_b_only_on_first_run && _mtx_b_reshape_kernel)
+  {
+    NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+  }
+
+  // Matrix multiply: reference kernel unless the assembly glue has been configured
+  if (!_asm_glue.is_configured())
+  {
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+  }
+  else
+  {
+    _asm_glue.run();
+  }
+
+  // Nothing else is scheduled here on the fused assembly path
+  if (_fused_assembly_path)
+  {
+    return;
+  }
+
+  // Matrix A reduction is needed only for a non-zero B offset
+  if (_b_offset != 0)
+  {
+    NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+  }
+
+  // Matrix B reduction is needed only for a non-zero A offset, and only here when it was not
+  // already done once in prepare()
+  if (!_reshape_b_only_on_first_run && _a_offset != 0)
+  {
+    NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+  }
+
+  // Offset contribution, with or without the fused output stage
+  if (_fuse_output_stage)
+  {
+    NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+  }
+  else
+  {
+    NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+  }
+}
+
+// One-off preparation executed before the first run: reshapes matrix B (assembly or reference
+// path) and computes the matrix B reduction when B is only reshaped on the first run.
+void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
+{
+  if (_is_prepared)
+  {
+    return;
+  }
+
+  if (_reshape_b_only_on_first_run && _asm_glue.is_configured())
+  {
+    // Assembly reshape
+    ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+    _asm_glue.prepare();
+    _original_b->mark_as_unused();
+  }
+  else if (_reshape_b_only_on_first_run && _mtx_b_reshape_kernel)
+  {
+    // Non-assembly reshape
+    ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+    // Reshape into the temporary and mark the original weights tensor as unused
+    _tmp_b.allocator()->allocate();
+    NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+    _original_b->mark_as_unused();
+  }
+
+  // The matrix B reduction is required only when the A offset is non-zero
+  if (_reshape_b_only_on_first_run && _a_offset != 0)
+  {
+    _vector_sum_col.allocator()->allocate();
+    NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+  }
+
+  _is_prepared = true;
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 000000000..90dabb35a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+// Creates a NEGatherKernelEx, configures it with the given tensors and axis, and stores it as
+// the kernel of this function.
+void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+  auto gather_kernel = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+  gather_kernel->configure(input, indices, output, axis);
+  _kernel = std::move(gather_kernel);
+}
+
+// Validation is delegated entirely to the underlying kernel.
+Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+                            const ITensorInfo *output, int axis)
+{
+  Status kernel_status = NEGatherKernelEx::validate(input, indices, output, axis);
+  return kernel_status;
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
new file mode 100644
index 000000000..624185d2c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+// Creates a NEHashtableLookupKernel, configures it with the given tensors, and stores it as
+// the kernel of this function.
+void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
+                                  ITensor *output, ITensor *hits)
+{
+  auto lookup_kernel = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
+  lookup_kernel->configure(lookups, keys, input, output, hits);
+  _kernel = std::move(lookup_kernel);
+}
+
+// Validation is delegated entirely to the underlying kernel.
+Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                   const ITensorInfo *input, const ITensorInfo *output,
+                                   const ITensorInfo *hits)
+{
+  Status kernel_status = NEHashtableLookupKernel::validate(lookups, keys, input, output, hits);
+  return kernel_status;
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..1c2c8f027
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+// Constructs an unconfigured function.
+// The memory manager is moved into the internal memory group, _is_nchw starts as false and
+// all remaining members are value-initialised; configure() must be called before run().
+NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+      _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+// Configures the normalization kernel. NCHW input is fed to the kernel directly; any other
+// layout is permuted to NCHW first and the kernel result is permuted back afterwards.
+// A null output makes the result of the permuted path go back into input.
+void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma,
+                                               ITensor *beta, float epsilon)
+{
+  _is_nchw = (input->info()->data_layout() == DataLayout::NCHW);
+
+  if (_is_nchw)
+  {
+    // No permutation required
+    _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+    return;
+  }
+
+  // Both permuted temporaries are handled by the memory group
+  _memory_group.manage(&_permuted_input);
+  _memory_group.manage(&_permuted_output);
+
+  // NHWC -> NCHW for the kernel input
+  _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+  _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+  _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+  _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+  // Permute the kernel result back, in place into input when no output was given
+  ITensor *const final_dst = (output != nullptr) ? output : input;
+  _permute_output.configure(&_permuted_output, final_dst, PermutationVector(2U, 0U, 1U));
+
+  _permuted_input.allocator()->allocate();
+  _permuted_output.allocator()->allocate();
+}
+
+// Validates the layer by forwarding NCHW copies of the tensor infos to the kernel.
+//
+// input   : info of the source tensor
+// output  : info of the destination tensor; may be nullptr for in-place operation, matching
+//           what configure() accepts
+// gamma   : info of the scale tensor, forwarded unchanged
+// beta    : info of the offset tensor, forwarded unchanged
+// epsilon : forwarded unchanged
+//
+// Returns the Status reported by NEInstanceNormalizationLayerKernelEx::validate.
+Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                const ITensorInfo *gamma, const ITensorInfo *beta,
+                                                float epsilon)
+{
+  // Work on clones so the layout override never touches the caller's infos
+  const std::unique_ptr<ITensorInfo> input_nchw = input->clone();
+  input_nchw->set_data_layout(DataLayout::NCHW);
+
+  // A null output means in-place; it must not be dereferenced and is forwarded as nullptr
+  std::unique_ptr<ITensorInfo> output_nchw;
+  if (output != nullptr)
+  {
+    output_nchw = output->clone();
+    output_nchw->set_data_layout(DataLayout::NCHW);
+  }
+
+  return NEInstanceNormalizationLayerKernelEx::validate(input_nchw.get(), output_nchw.get(),
+                                                        gamma, beta, epsilon);
+}
+
+// Runs the normalization kernel, wrapped in the input/output permutations when the configured
+// input was not NCHW.
+void NEInstanceNormalizationLayerEx::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  const bool needs_permute = !_is_nchw;
+
+  if (needs_permute)
+  {
+    _permute_input.run();
+  }
+
+  NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+  if (needs_permute)
+  {
+    _permute_output.run();
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
new file mode 100644
index 000000000..1150cef76
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
+
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+// Creates a NEPReLUKernel, configures it with the given tensors, and stores it as the kernel
+// of this function.
+void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+  auto prelu_kernel = arm_compute::support::cpp14::make_unique<NEPReLUKernel>();
+  prelu_kernel->configure(input, alpha, output);
+  _kernel = std::move(prelu_kernel);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
new file mode 100644
index 000000000..84411c266
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+// Constructs an unconfigured RNN layer.
+// The memory manager is moved into the internal memory group, _is_prepared starts as false and
+// all kernels, functions and intermediate tensors are value-initialised.
+NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+      _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+      _gemm_output(), _add_output(), _is_prepared(false)
+{
+}
+
+// Static validation of the RNN layer: checks that the tensor dimensions are mutually
+// consistent and that the fully connected, addition and activation stages accept the
+// intermediate tensor info. Returns an empty Status on success.
+Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                              const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+                              const ITensorInfo *hidden_state, const ITensorInfo *output,
+                              const ActivationLayerInfo &info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+                                      output);
+
+  const int width_idx = 0;
+  const int height_idx = 1;
+
+  // Input and weights must agree on the first dimension
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(width_idx) != weights->dimension(width_idx));
+  // Recurrent weights must be square and match the second dimension of weights
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) !=
+                              recurrent_weights->dimension(width_idx));
+  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(width_idx) !=
+                              recurrent_weights->dimension(height_idx));
+  // Bias is one-dimensional and sized like the second dimension of weights
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(width_idx) != weights->dimension(height_idx));
+  // Hidden state must fit weights and input, and output must have the hidden state's shape
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(width_idx) != weights->dimension(height_idx));
+  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(height_idx) != input->dimension(height_idx));
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                     hidden_state->tensor_shape());
+
+  // Info shared by all intermediate results
+  TensorInfo intermediate_info(misc::shape_calculator::compute_rnn_shape(
+                                   recurrent_weights, hidden_state->dimension(height_idx)),
+                               1, input->data_type());
+
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      NEFullyConnectedLayer::validate(input, weights, bias, &intermediate_info));
+  ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
+      &intermediate_info, &intermediate_info, &intermediate_info, ConvertPolicy::SATURATE));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      NEActivationLayerKernel::validate(&intermediate_info, &intermediate_info, info));
+
+  return Status{};
+}
+
+// Configures the RNN layer pipeline:
+//   fully connected(input, weights, bias) + gemm(hidden_state, recurrent_weights)
+//   -> activation written to hidden_state -> copy of hidden_state to output.
+// The three intermediate tensors are registered with the memory group.
+void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
+                             const ITensor *recurrent_weights, const ITensor *bias,
+                             ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+  ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
+                                                    recurrent_weights->info(), bias->info(),
+                                                    hidden_state->info(), output->info(), info));
+
+  const int height_idx = 1;
+  const TensorShape rnn_shape = misc::shape_calculator::compute_rnn_shape(
+      recurrent_weights->info(), hidden_state->info()->dimension(height_idx));
+  // Every intermediate tensor uses this shape, one channel and the input's data type
+  const TensorInfo intermediate_info(rnn_shape, 1, input->info()->data_type());
+
+  _is_prepared = false;
+
+  // Initialise the first two intermediate buffers
+  _fully_connected_out.allocator()->init(intermediate_info);
+  _gemm_output.allocator()->init(intermediate_info);
+
+  // Fully connected stage on the input
+  _memory_group.manage(&_fully_connected_out);
+  _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+  // GEMM stage on the hidden state
+  _memory_group.manage(&_gemm_output);
+  _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+  // Sum of both stages
+  _add_output.allocator()->init(intermediate_info);
+  _memory_group.manage(&_add_output);
+
+  _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
+                        ConvertPolicy::SATURATE);
+
+  _fully_connected_out.allocator()->allocate();
+  _gemm_output.allocator()->allocate();
+
+  // Activation writes the new hidden state
+  _activation_kernel.configure(&_add_output, hidden_state, info);
+  _add_output.allocator()->allocate();
+
+  // The hidden state is finally copied to the output
+  _copy_kernel.configure(hidden_state, output);
+}
+
+// Runs one RNN step: fully connected and GEMM stages, then addition, activation and the copy
+// of the hidden state to the output.
+void NERNNLayerEx::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  _fully_connected_kernel.run();
+  _gemm_state_f.run();
+
+  auto &scheduler = NEScheduler::get();
+  scheduler.schedule(&_add_kernel, Window::DimY);
+  scheduler.schedule(&_activation_kernel, Window::DimY);
+
+  // Copy hidden state to output
+  scheduler.schedule(&_copy_kernel, Window::DimY);
+}
+
+// Prepares the fully connected and GEMM functions once; later calls are no-ops until
+// configure() resets the flag.
+void NERNNLayerEx::prepare()
+{
+  if (_is_prepared)
+  {
+    return;
+  }
+
+  _fully_connected_kernel.prepare();
+  _gemm_state_f.prepare();
+
+  _is_prepared = true;
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
new file mode 100644
index 000000000..c65e93570
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+// Constructs an unconfigured function.
+// The memory manager is moved into the internal memory group; every other member is
+// value-initialised and is filled in by configure().
+NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+      _reduction_ops(), _keep_dims()
+{
+}
+
+// Static validation of a mean reduction.
+//
+// input          : info of the source tensor
+// reduction_axis : axes to reduce; negative values are wrapped around the input rank
+// keep_dims      : when true reduced axes are kept with size 1, otherwise they are removed
+// output         : info of the destination tensor; its shape must equal the computed one
+//
+// Returns an empty Status on success, otherwise the first error found.
+Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                                bool keep_dims, const ITensorInfo *output)
+{
+  // output is dereferenced below, so it has to be checked together with input
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+  TensorShape out_shape = input->tensor_shape();
+  const unsigned int reduction_ops = reduction_axis.num_dimensions();
+  const int input_dims = input->num_dimensions();
+  Coordinates axis_local = reduction_axis;
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+
+  // Axes are sorted so that remove_dimension can compensate for already removed dimensions
+  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+                                input->num_dimensions() - 1);
+    // An already initialised output must have size 1 along every kept reduced axis
+    if (output->total_size() > 0 && keep_dims)
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+    }
+    if (keep_dims)
+    {
+      out_shape.set(axis_local[i], 1);
+    }
+    else
+    {
+      // i dimensions in front of this axis have already been removed
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+  }
+  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+  return Status{};
+}
+
+// Configures a chain of MEAN_SUM reductions, one per requested axis.
+//
+// input          : source tensor
+// reduction_axis : axes to reduce (at least one); negative values are wrapped around the
+//                  input rank
+// keep_dims      : when true the last reduction writes straight into output; otherwise all
+//                  reductions go to intermediate tensors and a reshape drops the reduced axes
+// output         : destination tensor; auto-initialised when empty and keep_dims is false
+void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+                               ITensor *output)
+{
+  // output->info() is read below, so both tensors are required
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  // With no axis the unsigned "_reduction_ops - 1" expressions below would wrap around
+  ARM_COMPUTE_ERROR_ON(reduction_axis.num_dimensions() == 0);
+
+  _reduction_ops = reduction_axis.num_dimensions();
+  _reduction_kernels =
+      arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
+  // With keep_dims the last reduction needs no intermediate tensor
+  _reduced_outs =
+      arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+  _keep_dims = keep_dims;
+
+  Coordinates axis_local = reduction_axis;
+  const int input_dims = input->info()->num_dimensions();
+  const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+
+  // Perform reduction for every axis
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    // Each stage consumes the previous stage's result (or the input for the first stage)
+    TensorShape out_shape = i == 0 ? input->info()->tensor_shape()
+                                   : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+    out_shape.set(axis_local[i], 1);
+    auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+    if (i == _reduction_ops - 1 && keep_dims)
+    {
+      _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
+    }
+    else
+    {
+      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+                                                    input->info()->data_type(),
+                                                    input->info()->quantization_info())
+                                             .set_data_layout(output->info()->data_layout()));
+      _memory_group.manage(_reduced_outs.get() + i);
+      _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i],
+                                      ReductionOperation::MEAN_SUM);
+    }
+  }
+
+  // Allocate intermediate tensors
+  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+  {
+    _reduced_outs[i].allocator()->allocate();
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    TensorShape out_shape = input->info()->tensor_shape();
+
+    // We have to sort the reduction axis vectors in order for remove_dimension
+    // to work properly
+    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+    for (unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+    _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+  }
+}
+
+// Runs every configured reduction in order and, when the reduced dimensions are dropped,
+// the final reshape.
+void NEReduceMeanEx::run()
+{
+  // Scoped acquire/release of the memory group, as in the other run() implementations;
+  // the group is released on every exit path
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    _reduction_kernels[i].run();
+  }
+
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
new file mode 100644
index 000000000..b36f8287a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+      _reduction_ops(), _keep_dims() // all but the memory group are value-initialised
+{
+}
+
+// Validates the input data type, the reduction axes and the shape expected for output.
+Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                                   bool keep_dims, const ITensorInfo *output, ReduceOperation op)
+{
+  ARM_COMPUTE_UNUSED(op);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // output is dereferenced below
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+  TensorShape out_shape = input->tensor_shape();
+  const unsigned int reduction_ops = reduction_axis.num_dimensions();
+  const int input_dims = input->num_dimensions();
+  Coordinates axis_local = reduction_axis;
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+  // Sort ascending so that remove_dimension(axis - i) below targets the intended dimension
+  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+                                input->num_dimensions() - 1);
+    if (keep_dims)
+    {
+      // An already initialised output must have extent 1 on every reduced axis
+      ARM_COMPUTE_RETURN_ERROR_ON(output->total_size() > 0 &&
+                                  output->dimension(axis_local[i]) != 1);
+      out_shape.set(axis_local[i], 1);
+    }
+    else
+    {
+      // i lower axes were removed already, which shifts this axis down by i
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+  }
+  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+  return Status{};
+}
+
+// Chains one reduction kernel per axis; a final reshape drops the axes when !keep_dims.
+void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+                                  ITensor *output, ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_ON(reduction_axis.num_dimensions() == 0); // sizes below would underflow
+
+  _reduction_ops = reduction_axis.num_dimensions();
+  _reduction_kernels.resize(_reduction_ops);
+  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+  _keep_dims = keep_dims;
+
+  Coordinates axis_local = reduction_axis;
+  const int input_dims = input->info()->num_dimensions();
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+
+  // Perform reduction for every axis; step i reads the output of step i - 1
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    TensorShape out_shape =
+        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+    out_shape.set(axis_local[i], 1);
+    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+    if (i == _reduction_ops - 1 && keep_dims)
+    {
+      _reduction_kernels[i].configure(in, output, axis_local[i], op);
+    }
+    else
+    {
+      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+                                                    input->info()->data_type(),
+                                                    input->info()->quantization_info()));
+      _memory_group.manage(&_reduced_outs[i]);
+      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op);
+    }
+  }
+
+  // Allocate intermediate tensors
+  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+  {
+    _reduced_outs[i].allocator()->allocate();
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    TensorShape out_shape = input->info()->tensor_shape();
+
+    // Sort the reduction axes so that remove_dimension(axis - i) works properly
+    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+    for (unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+  }
+}
+
+void NEReduceOperation::run()
+{
+  // Scope object bound to _memory_group for the duration of this call
+  MemoryGroupResourceScope scope_mg(_memory_group);
+  for (auto &reduction_kernel : _reduction_kernels)
+  {
+    reduction_kernel.run();
+  }
+  // Without keep_dims the reshape set up in configure() produces the final output
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
new file mode 100644
index 000000000..3c18217ef
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+      _reduction_ops(), _keep_dims() // all but the memory group are value-initialised
+{
+}
+
+// Validates the input data type, the reduction axes and the shape expected for output.
+Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+                             bool keep_dims, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // output is dereferenced below
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+  TensorShape out_shape = input->tensor_shape();
+  const unsigned int reduction_ops = reduction_axis.num_dimensions();
+  const int input_dims = input->num_dimensions();
+  Coordinates axis_local = reduction_axis;
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+  // Sort ascending so that remove_dimension(axis - i) below targets the intended dimension
+  std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+  for (unsigned int i = 0; i < reduction_ops; ++i)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+                                input->num_dimensions() - 1);
+    if (keep_dims)
+    {
+      // An already initialised output must have extent 1 on every reduced axis
+      ARM_COMPUTE_RETURN_ERROR_ON(output->total_size() > 0 &&
+                                  output->dimension(axis_local[i]) != 1);
+      out_shape.set(axis_local[i], 1);
+    }
+    else
+    {
+      // i lower axes were removed already, which shifts this axis down by i
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+  }
+  const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+  return Status{};
+}
+
+// Chains one SUM reduction kernel per axis; a final reshape drops the axes when !keep_dims.
+void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+                            ITensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_ON(reduction_axis.num_dimensions() == 0); // sizes below would underflow
+
+  _reduction_ops = reduction_axis.num_dimensions();
+  _reduction_kernels.resize(_reduction_ops);
+  _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+  _keep_dims = keep_dims;
+
+  Coordinates axis_local = reduction_axis;
+  const int input_dims = input->info()->num_dimensions();
+
+  // Convert negative axis
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    axis_local[i] = wrap_around(axis_local[i], input_dims);
+  }
+
+  // Perform reduction for every axis; step i reads the output of step i - 1
+  for (unsigned int i = 0; i < _reduction_ops; ++i)
+  {
+    TensorShape out_shape =
+        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+    out_shape.set(axis_local[i], 1);
+    auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+    if (i == _reduction_ops - 1 && keep_dims)
+    {
+      _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM);
+    }
+    else
+    {
+      _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+                                                    input->info()->data_type(),
+                                                    input->info()->quantization_info())
+                                             .set_data_layout(input->info()->data_layout()));
+      _memory_group.manage(&_reduced_outs[i]);
+      _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
+                                      ReductionOperation::SUM);
+    }
+  }
+
+  // Allocate intermediate tensors
+  for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+  {
+    _reduced_outs[i].allocator()->allocate();
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    TensorShape out_shape = input->info()->tensor_shape();
+
+    // Sort the reduction axes so that remove_dimension(axis - i) works properly
+    std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+    for (unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+      out_shape.remove_dimension(axis_local[i] - i);
+    }
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+    _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+  }
+}
+
+void NEReduceSum::run()
+{
+  // Scope object bound to _memory_group for the duration of this call
+  MemoryGroupResourceScope scope_mg(_memory_group);
+  for (auto &reduction_kernel : _reduction_kernels)
+  {
+    reduction_kernel.run();
+  }
+  // Without keep_dims the reshape set up in configure() produces the final output
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
new file mode 100644
index 000000000..c3431c418
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Select the window dimension along which the reduction kernel is split
+ *
+ * Axis 0 maps to Window::DimY, axes 1 to 3 map to Window::DimX; anything else is an error.
+ *
+ * @param[in] axis Reduction axis
+ *
+ * @return The dimension to split the window
+ */
+size_t reduction_window_split_dimension(unsigned int axis)
+{
+  if (axis == 0)
+  {
+    return Window::DimY;
+  }
+  if (axis <= 3)
+  {
+    return Window::DimX;
+  }
+  ARM_COMPUTE_ERROR("Unsupported reduction axis");
+}
+} // namespace
+
+NEReductionOperationEx::NEReductionOperationEx() // kernels are set up later by configure()
+    : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
+{
+}
+
+Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                        unsigned int axis, ReduceOperation op)
+{
+  // No function-level constraints are added here, so the outcome is whatever the
+  // kernel-level validation reports (an OK status when it succeeds)
+  return NEReductionOperationKernelEx::validate(input, output, axis, op);
+}
+
+/** Configure the reduction kernel and, for axis 0 only, the border-fill kernel
+ *
+ * For axis 0 the input border is filled with a constant picked per operation and data type:
+ *   - MIN: float max for F32, 65504 for F16, 255 for QASYMM8
+ *   - MAX: -float max for F32, -65504 for F16, 0 for QASYMM8
+ * (presumably so that border elements can never win the reduction). Any other operation or
+ * data type on that path is reported through ARM_COMPUTE_ERROR.
+ *
+ * @param[in]  input  Source tensor, also handed to the border-fill kernel when axis is 0
+ * @param[out] output Destination tensor
+ * @param[in]  axis   Axis to reduce along
+ * @param[in]  op     Reduction operation to perform
+ */
+void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
+                                       ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      NEReductionOperationEx::validate(input->info(), output->info(), axis, op));
+
+  // Set up the reduction itself and remember how run() has to schedule it
+  _reduction_kernel.configure(input, output, axis, op);
+  _window_split = reduction_window_split_dimension(axis);
+  _reduction_axis = axis;
+
+  // Only a reduction along axis 0 uses the border-fill kernel
+  if (axis != 0)
+  {
+    return;
+  }
+
+  // Border geometry is taken from the reduction kernel configured above
+  const BorderSize fill_border_size = _reduction_kernel.border_size();
+  const DataType data_type = input->info()->data_type();
+  PixelValue pixelValue;
+
+  // Choose the constant written into the border
+  switch (op)
+  {
+    case ReduceOperation::MIN:
+      // Fill with the upper bound used for the data type
+      switch (data_type)
+      {
+        case DataType::F32:
+          pixelValue = PixelValue(std::numeric_limits<float>::max());
+          break;
+        case DataType::F16:
+          pixelValue = PixelValue(static_cast<half>(65504.0f));
+          break;
+        case DataType::QASYMM8:
+          pixelValue = PixelValue(255, data_type, input->info()->quantization_info());
+          break;
+        default:
+          ARM_COMPUTE_ERROR("Unsupported DataType");
+      }
+      break;
+    case ReduceOperation::MAX:
+      // Fill with the lower bound used for the data type
+      switch (data_type)
+      {
+        case DataType::F32:
+          pixelValue = PixelValue(-std::numeric_limits<float>::max());
+          break;
+        case DataType::F16:
+          pixelValue = PixelValue(static_cast<half>(-65504.0f));
+          break;
+        case DataType::QASYMM8:
+          pixelValue = PixelValue(0, data_type, input->info()->quantization_info());
+          break;
+        default:
+          ARM_COMPUTE_ERROR("Unsupported DataType");
+      }
+      break;
+    default:
+      ARM_COMPUTE_ERROR("Reduction Operation unsupported");
+  }
+
+  _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
+}
+
+void NEReductionOperationEx::run()
+{
+  if (_reduction_axis == 0) // the border-fill kernel is only configured for axis 0
+  {
+    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+  }
+  NEScheduler::get().schedule(&_reduction_kernel, _window_split); // split chosen in configure()
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
new file mode 100644
index 000000000..c9f914fb0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() // zero-fill is off until configure() enables it
+    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+{
+}
+
+void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape,
+                                      const ITensor *paddings, ITensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+  _has_padding =
+      input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size();
+  if (_has_padding) // recomputed per call; a size mismatch means the output is zeroed first
+  {
+    _memset_kernel.configure(
+        output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+  }
+  _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x,
+                                      const int block_shape_y, const Size2D &padding_left,
+                                      const Size2D &padding_right, ITensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  _has_padding =
+      input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size();
+  if (_has_padding) // recomputed per call; a size mismatch means the output is zeroed first
+  {
+    _memset_kernel.configure(
+        output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+  }
+  _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right,
+                                   output);
+}
+
+Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape,
+                                       const ITensorInfo *paddings, const ITensorInfo *output)
+{
+  // Variant taking block shape and paddings as tensors.
+  // Nothing is checked at function level: the outcome is whatever the kernel-level
+  // validation reports (an OK status when it succeeds)
+  return NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+}
+
+Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x,
+                                       const int block_shape_y, const Size2D &padding_left,
+                                       const Size2D &padding_right, const ITensorInfo *output)
+{
+  // Variant taking a static block shape and paddings; nothing is checked at function
+  // level, so the kernel-level validation result is the result
+  return NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left,
+                                             padding_right, output);
+}
+
+void NESpaceToBatchLayerEx::run()
+{
+  auto &scheduler = NEScheduler::get();
+  if (_has_padding) // zero out the output first, but only when paddings are present
+  {
+    scheduler.schedule(&_memset_kernel, Window::DimY);
+  }
+  scheduler.schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
new file mode 100644
index 000000000..b6ae21cc0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+{
+  auto kernel = support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>(); // not yet configured
+  kernel->configure(input, output, block_shape);
+  _kernel = std::move(kernel); // the configured kernel is moved into _kernel
+}
+
+Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                       int32_t block_shape)
+{
+  // Nothing is checked at function level; the kernel-level validation result is the result
+  return NESpaceToDepthLayerKernelEx::validate(input, output, block_shape);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
new file mode 100644
index 000000000..fd15ef05f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)), // the memory manager is moved into the group
+      _conv_f(),
+      _upsample_f(),
+      _flip_weights(),
+      _permute_input(),
+      _permute_weights(),
+      _permute_output(),
+      _scaled_output(),
+      _weights_flipped(),
+      _permuted_input(),
+      _permuted_weights(),
+      _permuted_output(),
+      _is_nchw(false),            // overwritten by configure()
+      _original_weights(nullptr), // set by configure()
+      _input(nullptr),            // set by configure()
+      _info(),
+      _is_prepared(false) // prepare() has not run yet
+{ // Creates an unconfigured layer: members are only default-initialized; configure() sets it up.
+}
+// Static validation of a transpose-convolution setup; only tensor metadata (ITensorInfo) is read.
+Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                      const ITensorInfo *bias, const ITensorInfo *output,
+                                      const PadStrideInfo &info, unsigned int invalid_right,
+                                      unsigned int invalid_bottom)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
+                                                       DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
+  const unsigned int width_idx =
+      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+  const unsigned int height_idx =
+      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
+  // Weights must be square and non-empty (checks above); next, the helper derives output dims.
+  auto out_dims = transposeconv_output_dimensions(
+      input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
+      weights->dimension(height_idx), info, invalid_right, invalid_bottom);
+  // NOTE(review): repeats the weights/input data-type check above. bias is optional (may be null).
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+  if (is_data_type_quantized_asymmetric(input->data_type()) && bias)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+  }
+  else if (bias)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+  }
+  // The output checks below are skipped while output has total_size() == 0.
+  if (output->tensor_shape().total_size() > 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+    // NOTE(review): only an output smaller than the computed shape is rejected by these checks.
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(),
+                                    "Output's dim 0 is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(),
+                                    "Output's dim 1 is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(),
+                                    "Output's dim 2 is invalid.");
+  }
+  // pad_* are presumably out-parameters of the helper below (TODO confirm); unused afterwards here.
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+      pad_bottom);
+  TensorInfo scale_out_info(
+      input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+  scale_out_info.set_data_layout(input->data_layout());
+  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+  // NOTE(review): indices below use the weights' layout but are applied to input/scale_out_info.
+  const unsigned int batches_idx =
+      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+  const unsigned int channel_idx =
+      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) !=
+                              scale_out_info.dimension(batches_idx));
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) !=
+                              scale_out_info.dimension(channel_idx));
+  // Last step: validate the stride-1 convolution that consumes the upsampled tensor info.
+  ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+                                                           conv_info, WeightsInfo()));
+
+  return Status{};
+}
+// Sets up: [permute in] -> upsample -> stride-1 convolution with flipped weights -> [permute out].
+void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias,
+                                     ITensor *output, const PadStrideInfo &info,
+                                     unsigned int invalid_right, unsigned int invalid_bottom)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+  // The bracketed permute steps are configured only when the input layout is not NCHW.
+  const DataLayout data_layout = input->info()->data_layout();
+  // Keep arguments/state in members; prepare() and run() read _original_weights and _is_nchw.
+  _input = input;
+  _original_weights = weights;
+  _info = info;
+  _is_prepared = false;
+  _is_nchw = data_layout == DataLayout::NCHW;
+
+  const unsigned int stride_x = info.stride().first;
+  const unsigned int stride_y = info.stride().second;
+
+  const unsigned int width_idx =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const unsigned int height_idx =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  auto out_dims = transposeconv_output_dimensions(
+      input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+      weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
+      invalid_right, invalid_bottom);
+
+  const TensorShape output_shape =
+      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+  // Output auto initialization if not yet initialized
+  auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+                     input->info()->quantization_info());
+
+  // Perform validation step
+  ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
+      input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+      info, invalid_right, invalid_bottom));
+  // Register the intermediate tensors with the memory group before they are configured.
+  _memory_group.manage(&_scaled_output);
+
+  if (!_is_nchw)
+  {
+    _memory_group.manage(&_permuted_input);
+    _memory_group.manage(&_permuted_weights);
+    _memory_group.manage(&_permuted_output);
+
+    // Configure the function to transform the input tensor from NHWC -> NCHW
+    _permuted_input.info()->set_quantization_info(input->info()->quantization_info());
+    _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+    _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+    // Configure the function to transform the weights tensor from NHWC -> NCHW
+    _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info());
+    _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+    _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
+    // order to match output shape
+
+    unsigned int pad_left = 0;
+    unsigned int pad_right = 0;
+    unsigned int pad_top = 0;
+    unsigned int pad_bottom = 0;
+    const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+        *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right,
+        invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
+
+    TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(),
+                              _permuted_input.info()->quantization_info());
+    scale_out_info.set_data_layout(DataLayout::NCHW);
+    _scaled_output.allocator()->init(scale_out_info);
+    // NOTE(review): upsample_info uses CEIL rounding here but FLOOR in the NCHW branch below.
+    const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+                                      DimensionRoundingType::CEIL);
+    _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
+    // _weights_flipped is only initialized here; it is allocated later in prepare().
+    _weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
+    _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info());
+    _flip_weights.configure(&_permuted_weights, &_weights_flipped);
+
+    // setup the function to convolve the upscaled output
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+    // Convolution writes into _permuted_output, whose shape is the output shape reordered.
+    const auto out_shape = output->info()->tensor_shape();
+    TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]};
+    TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(),
+                                 output->info()->quantization_info());
+    _permuted_output.allocator()->init(permuted_out_info);
+    _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info);
+
+    // Configure the function to transform the convoluted output to NHWC
+    _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+    // Permute temporaries are allocated here; _scaled_output is allocated after the if/else.
+    _permuted_input.allocator()->allocate();
+    _permuted_weights.allocator()->allocate();
+    _permuted_output.allocator()->allocate();
+  }
+  else
+  {
+    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
+    // order to match output shape
+    unsigned int pad_left = 0;
+    unsigned int pad_right = 0;
+    unsigned int pad_top = 0;
+    unsigned int pad_bottom = 0;
+    const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+        *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+        pad_right, pad_top, pad_bottom);
+
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+                              input->info()->quantization_info());
+    _scaled_output.allocator()->init(scale_out_info);
+    const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+                                      DimensionRoundingType::FLOOR);
+    _upsample_f.configure(input, &_scaled_output, upsample_info);
+    // As above, _weights_flipped is initialized now and allocated in prepare().
+    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+    _flip_weights.configure(weights, &_weights_flipped);
+
+    // setup the function to convolve the upscaled output
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+  }
+  _scaled_output.allocator()->allocate();
+}
+
+void NETransposeConvLayer::run()
+{
+  // One-time weight preparation; prepare() does nothing once _is_prepared is set.
+  prepare();
+
+  // MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Non-NCHW tensors are wrapped by a permute step before and after the core pipeline.
+  const bool uses_permute = !_is_nchw;
+  if (uses_permute)
+  {
+    _permute_input.run();
+  }
+  // Core pipeline: upsample, then convolve.
+  _upsample_f.run();
+  _conv_f.run();
+  if (uses_permute)
+  {
+    _permute_output.run();
+  }
+}
+
+void NETransposeConvLayer::prepare()
+{
+  // Everything below is one-time work; later calls return immediately.
+  if (_is_prepared)
+  {
+    return;
+  }
+  ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+  // Allocate the buffer for the flipped weights. For non-NCHW input the weights are permuted
+  // first, then flipped; afterwards the original weights are marked as unused.
+  _weights_flipped.allocator()->allocate();
+  if (!_is_nchw)
+  {
+    _permute_weights.run();
+  }
+  NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
+  _original_weights->mark_as_unused();
+
+  // Let the convolution do its own preparation, then free the flipped weights if they are no
+  // longer marked as used.
+  _conv_f.prepare();
+  if (!_weights_flipped.is_used())
+  {
+    _weights_flipped.allocator()->free();
+  }
+  _is_prepared = true;
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp
new file mode 100644
index 000000000..67e1bfb02
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/misc/functions/GenericGather.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+// True when a non-4D input with NCHW layout is paired with a 4D output.
+bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output)
+{
+  const bool rank_grows_to_4d = input->num_dimensions() != 4 && output->num_dimensions() == 4;
+  return rank_grows_to_4d && input->data_layout() == DataLayout::NCHW;
+}
+// Configures a CL gather; when shouldPermute() holds, the gather writes into _cl_permuted and a
+// CL permute produces the final output. Throws std::runtime_error when not in GPU mode.
+void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices,
+                              arm_compute::ITensor *output, int axis)
+{
+  _input = input;
+  _indices = indices;
+  _output = output;
+  _axis = axis;
+
+  const bool needs_permute = shouldPermute(input->info(), output->info());
+
+  arm_compute::PermutationVector pv;
+  if (needs_permute)
+  {
+    // NOTE This vector comes from CLPermuteKernel implementation
+    //
+    // This implementation permutes a tensor of shape C / W / H into another tensor of shape
+    // W / H / C
+    //
+    //     Original | Permuted
+    // 0 | C        | W (from 1)
+    // 1 | W        | H (from 2)
+    // 2 | H        | C (from 0)
+    //
+    pv = arm_compute::PermutationVector{1, 2, 0};
+  }
+
+  if (!utils::isGpuMode())
+  {
+    throw std::runtime_error("Not supported, yet");
+  }
+
+  if (!needs_permute)
+  {
+    _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis);
+    return;
+  }
+
+  _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis);
+  _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv);
+
+  // NOTE _cl_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+  _cl_permuted.allocator()->allocate();
+}
+// Runs the CL gather and then, when shouldPermute() holds, the CL permute. GPU mode only.
+void GenericGather::run(void)
+{
+  if (!utils::isGpuMode())
+  {
+    throw std::runtime_error("Not supported, yet");
+  }
+
+  _cl_gather.run();
+  // Same predicate as in configure(), evaluated on the stored tensors.
+  const bool needs_permute = shouldPermute(_input->info(), _output->info());
+  if (needs_permute)
+  {
+    _cl_permute.run();
+  }
+}
+
+} // namespace misc
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp
new file mode 100644
index 000000000..8025ae28e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+namespace
+{
+// True when one side is 4D, the two ranks differ, and the input layout is NCHW.
+bool shouldPermute(const arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output)
+{
+  const bool has_4d_side = input->num_dimensions() == 4 || output->num_dimensions() == 4;
+  const bool rank_differs = has_4d_side && input->num_dimensions() != output->num_dimensions();
+  return rank_differs && input->data_layout() == DataLayout::NCHW;
+}
+
+} // namespace
+// Configures a reshape on the backend chosen by isGpuMode(). When shouldPermute() holds, the
+// input is first permuted into _cl_permuted / _neon_permuted and that tensor is reshaped.
+void GenericReshapeLayer::configure(const arm_compute::ITensor *input, arm_compute::ITensor *output)
+{
+  _input = input;
+  _output = output;
+
+  const bool is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+  const bool input_is_4d = input->info()->num_dimensions() == 4;
+  const bool output_is_4d = output->info()->num_dimensions() == 4;
+
+  arm_compute::PermutationVector pv;
+  if (is_nchw && input_is_4d && !output_is_4d)
+  {
+    // NOTE This vector comes from CLPermuteKernel implementation
+    //
+    // This implementation permutes a tensor of shape W / H / C into another tensor of shape
+    // C / W / H
+    //
+    //     Original | Permuted
+    // 0 | W        | C (from 2)
+    // 1 | H        | W (from 0)
+    // 2 | C        | H (from 1)
+    //
+    pv = arm_compute::PermutationVector{2, 0, 1};
+  }
+  else if (is_nchw && !input_is_4d && output_is_4d)
+  {
+    // NOTE This vector comes from CLPermuteKernel implementation
+    //
+    // This implementation permutes a tensor of shape C / W / H into another tensor of shape
+    // W / H / C
+    //
+    //     Original | Permuted
+    // 0 | C        | W (from 1)
+    // 1 | W        | H (from 2)
+    // 2 | H        | C (from 0)
+    //
+    pv = arm_compute::PermutationVector{1, 2, 0};
+  }
+
+  const bool needs_permute = shouldPermute(input->info(), output->info());
+
+  if (utils::isGpuMode())
+  {
+    const auto cl_input = CAST_CL(const_cast<arm_compute::ITensor *>(input));
+    if (!needs_permute)
+    {
+      _cl_reshape.configure(cl_input, CAST_CL(output));
+      return;
+    }
+    _cl_permute.configure(cl_input, &_cl_permuted, pv);
+    _cl_reshape.configure(&_cl_permuted, CAST_CL(output));
+
+    // NOTE _cl_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+    _cl_permuted.allocator()->allocate();
+    return;
+  }
+
+  if (!needs_permute)
+  {
+    _neon_reshape.configure(input, output);
+    return;
+  }
+  _neon_permute.configure(input, &_neon_permuted, pv);
+  _neon_reshape.configure(&_neon_permuted, output);
+
+  // NOTE _neon_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+  _neon_permuted.allocator()->allocate();
+}
+// Runs the optional permute stage followed by the reshape on the backend chosen by isGpuMode().
+void GenericReshapeLayer::run(void)
+{
+  const bool needs_permute = shouldPermute(_input->info(), _output->info());
+  if (utils::isGpuMode())
+  {
+    if (needs_permute)
+    {
+      _cl_permute.run();
+    }
+    _cl_reshape.run();
+    return;
+  }
+
+  if (needs_permute)
+  {
+    _neon_permute.run();
+  }
+  _neon_reshape.run();
+}
+
+} // namespace misc
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp
new file mode 100644
index 000000000..44a4bb9ed
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/misc/functions/Utils.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace utils
+{
+// Backend switch: returns false only when the environment variable NEON is set and its first
+// character is '1'; returns true in every other case, including when NEON is unset.
+bool isGpuMode()
+{
+  const char *neon_env = std::getenv("NEON");
+  // Only the first character is inspected, so a value such as "10" also counts as '1'.
+  const bool neon_requested = (neon_env != nullptr) && (neon_env[0] == '1');
+
+  return !neon_requested;
+}
+
+} // namespace utils
+} // namespace misc
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h
new file mode 100644
index 000000000..f94effea1
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/topk_v2.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file topk_v2.h
+ * @brief This file contains TopK method and TopContainer class for TopK operation
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+
+using int32 = int32_t; // 32-bit signed alias used for indexes and sizes in this header
+
+namespace nnfw
+{
+namespace rt
+{
+namespace optimized_ops
+{
+/**
+ * @brief Class that collects the indexes of the top k values of a row
+ * @note The following code is implemented and modified while referring to the TFLite topk_v2.cc
+ * file. TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM and TENSOR_INT32
+ * (TFLite additionally supports kTfLiteInt64).
+ *
+ * Based on template tensorflow::gtl::TopN<> but, for optimization, it re-uses the same container.
+ */
+template <typename T> class TopContainer
+{
+public:
+  /**
+   * @brief Prevent default construction of this class
+   */
+  TopContainer() = delete;
+  /**
+   * @brief Constructor with params
+   * @param [in] k The top k predictions
+   * @param [in] row_size Size of row in data
+   */
+  TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
+  {
+    // One slot more than is kept: push() uses the last element as a scratch slot.
+    container_.reserve(std::min(k, row_size) + 1);
+  }
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+  TopContainer(const TopContainer &) = delete;
+  /**
+   * @brief Prevent instances of this class from being assigned (As this class contains pointers)
+   */
+  TopContainer &operator=(const TopContainer &) = delete;
+
+  /**
+   * @brief Start collecting a new row; indexes pushed so far are discarded
+   * @param [in] values Row data that the pushed indexes refer to
+   * @return N/A
+   */
+  void start_collecting(const T *values)
+  {
+    container_.clear();
+    values_ = values;
+  }
+
+  /**
+   * @brief Push an index whose value is to be compared for topk
+   * @param [in] a Index into the current row
+   * @return N/A
+   */
+  void push(int32 a)
+  {
+    auto better = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
+    const size_t limit = static_cast<size_t>(k_);
+    if (container_.size() > limit)
+    {
+      // Heap phase: front() is the weakest kept index, back() is the scratch slot.
+      if (better(a, container_.front()))
+      {
+        container_.back() = a;
+        std::push_heap(container_.begin(), container_.end(), better);
+        std::pop_heap(container_.begin(), container_.end(), better);
+      }
+      return;
+    }
+    // Fill phase: append until k + 1 indexes are stored, then build the heap once.
+    container_.push_back(a);
+    if (container_.size() == limit + 1)
+    {
+      std::make_heap(container_.begin(), container_.end(), better);
+      std::pop_heap(container_.begin(), container_.end(), better);
+    }
+  }
+
+  /**
+   * @brief Get sorted result from pushed values
+   * @return Reference of vector with sorted indexes (largest value first)
+   */
+  const std::vector<int32> &sorted_result()
+  {
+    auto better = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
+    if (container_.size() > static_cast<size_t>(k_))
+    {
+      // Heap phase was reached: sort the k kept indexes and drop the scratch slot.
+      std::sort_heap(container_.begin(), container_.end() - 1, better);
+      container_.resize(k_);
+    }
+    else
+    {
+      std::sort(container_.begin(), container_.end(), better);
+    }
+    return container_;
+  }
+
+private:
+  int32 k_;
+  std::vector<int32> container_;
+  const T *values_ = nullptr;
+
+  // Orders indexes by descending value; equal values are ordered by ascending index.
+  bool compare_fun(int32 a, int32 b) const
+  {
+    const T &lhs = values_[a];
+    const T &rhs = values_[b];
+    if (rhs < lhs)
+    {
+      return true;
+    }
+    if (rhs > lhs)
+    {
+      return false;
+    }
+    return a < b;
+  }
+};
+
+/**
+ * @brief Computes the top k entries of every row of a row-major matrix
+ * @param [in] row_size Size of row in data
+ * @param [in] num_rows The number of rows in data
+ * @param [in] data Input values, read as num_rows consecutive rows of row_size elements
+ * @param [in] k The top k predictions
+ * @param [out] output_indexes Column indexes of the top k values; row r is written at offset r * k
+ * @param [out] output_values Values at those indexes; row r is written at offset r * k
+ * @return N/A
+ */
+template <typename T>
+void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
+          T *output_values)
+{
+  TopContainer<T> collector(k, row_size);
+  for (int row = 0; row < num_rows; ++row)
+  {
+    const T *row_values = data + row * row_size;
+    collector.start_collecting(row_values);
+    for (int32 col = 0; col < row_size; ++col)
+    {
+      collector.push(col);
+    }
+    // The container hands back the winning column indexes already sorted.
+    const std::vector<int32> &winners = collector.sorted_result();
+    int32 *index_out = output_indexes + row * k;
+    T *value_out = output_values + row * k;
+    for (const int32 col : winners)
+    {
+      *index_out++ = col;
+      *value_out++ = row_values[col];
+    }
+  }
+}
+
+} // namespace optimized_ops
+} // namespace rt
+} // namespace nnfw
+
+#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__