19 files changed, 376 insertions, 993 deletions
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
deleted file mode 100644
index 2d379cf36..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-namespace arm_compute
-{
-
-CLArgOperation::CLArgOperation()
-{
-  // DO NOTHING
-}
-
-void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
-                               ArgOperation op)
-{
-  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
-  _input = input;
-  _output = output;
-  _axis = axis;
-  _arg_op = op;
-  // NOTE The argminmax_axis must have no duplication.
-  _num_of_kernels = axis.size();
-  const size_t num_of_interm_tensors = _num_of_kernels - 1;
-
-  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
-  _argop_kernels =
-      arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
-
-  TensorShape shape{input->info()->tensor_shape()};
-  for (size_t i = 0; i < num_of_interm_tensors; i++)
-  {
-    shape.set(_axis[i], 1);
-    _interm_tensors[i].allocator()->init(
-        TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
-            .set_data_layout(input->info()->data_layout()));
-    _interm_tensors[i].allocator()->allocate();
-  }
-
-  // Set a vector that is ordered ICLTensors sequentially.
-  std::vector<ICLTensor *> tensors;
-  tensors.emplace_back(input);
-  for (size_t i = 0; i < num_of_interm_tensors; i++)
-  {
-    tensors.emplace_back(_interm_tensors.get() + i);
-  }
-  tensors.emplace_back(output);
-
-  // Apply ArgMinMax on all kernels
-  for (size_t i = 0; i < _num_of_kernels; i++)
-  {
-    _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
-  }
-}
-
-Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
-                                const ITensorInfo *output, ArgOperation op)
-{
-  const size_t num_of_kernels = axis.size();
-  const size_t num_of_interm_tensors = num_of_kernels - 1;
-
-  // Create temporary tensor infos
-  auto interm_tensors =
-      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
-
-  // Create intermediate tensor info
-  TensorShape shape{input->tensor_shape()};
-
-  for (size_t i = 0; i < num_of_interm_tensors; i++)
-  {
-    shape.set(axis[i], 1);
-    interm_tensors[i].set_data_type(input->data_type());
-    interm_tensors[i].set_tensor_shape(shape);
-    interm_tensors[i].set_num_channels(input->num_channels());
-  }
-
-  // Set a vector that is ordered ITensorInfo sequentially.
-  std::vector<const ITensorInfo *> tensors;
-  tensors.emplace_back(input);
-  for (size_t i = 0; i < num_of_interm_tensors; i++)
-  {
-    tensors.emplace_back(interm_tensors.get() + i);
-  }
-  tensors.emplace_back(output);
-
-  // Validate argminmax only on all kernels
-  for (size_t i = 0; i < num_of_kernels; i++)
-  {
-    ARM_COMPUTE_RETURN_ON_ERROR(
-        CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
-  }
-
-  return Status{};
-}
-
-void CLArgOperation::run()
-{
-  for (size_t i = 0; i < _num_of_kernels; ++i)
-  {
-    CLScheduler::get().enqueue(_argop_kernels[i]);
-  }
-}
-
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
index 92ee69a36..e5122ab8f 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -48,7 +48,7 @@ using namespace arm_compute;
 void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
                                   BinaryLogicalOperation op)
 {
-  auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+  auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
   k->configure(input1, input2, output, op);
   _kernel = std::move(k);
 
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
deleted file mode 100644
index b3118f39e..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLCast.h"
-
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-using namespace arm_compute;
-
-void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
-{
-  auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
-  k->configure(input, output, input_subtype);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
deleted file mode 100644
index db662505a..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
-
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-using namespace arm_compute;
-
-void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
-{
-  auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
-  k->configure(input, output, block_size);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
new file mode 100644
index 000000000..3dede0562
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+// Hands the memory manager to the internal memory group. All other members start empty: no
+// original weights are attached yet and _is_prepared is false.
+CLDirectTransposeConvLayer::CLDirectTransposeConvLayer(
+    std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _scale_f(),
+      _conv_f(),
+      _flip_weights(),
+      _scaled_output(),
+      _original_weights(nullptr),
+      _weights_flipped(),
+      _flip_axis(),
+      _is_prepared(false)
+{
+}
+
+// Checks whether a transpose convolution can be configured with the given tensor infos.
+//
+// Requirements enforced here:
+// - input, weights and output are non-null (bias may be nullptr);
+// - input is QASYMM8_SIGNED, QASYMM8, F16 or F32, and weights/output share its data type;
+// - weights share the input's data layout and are square (width == height, at least 1);
+// - bias, if given, is S32 for asymmetric quantized input and otherwise has the input's data
+//   type, and shares the input's data layout;
+// - output width, height and channel count equal the computed transpose convolution shape.
+// The upsample and convolution stages are validated as well.
+//
+// Returns an empty Status if every check passes.
+Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                            const ITensorInfo *bias, ITensorInfo *output,
+                                            const PadStrideInfo &info, unsigned int invalid_right,
+                                            unsigned int invalid_bottom,
+                                            const WeightsInfo &weights_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+  const DataLayout data_layout = input->data_layout();
+
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+  // Expected output dimensions and shape for these inputs
+  auto out_dims = transposeconv_output_dimensions(
+      input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+      weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+
+  const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+  if (bias != nullptr)
+  {
+    if (is_data_type_quantized_asymmetric(input->data_type()))
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+    }
+    else
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+                                  "Output's width is invalid.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+                                  "Output's height is invalid.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+                                  "Output's depth is invalid.");
+
+  // Shape of the intermediate upsampled tensor (the pad_* values are not used afterwards)
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+      pad_bottom);
+  TensorInfo scale_out_info(input->clone()
+                                ->set_is_resizable(true)
+                                .reset_padding()
+                                .set_tensor_shape(scale_out_shape)
+                                .set_data_layout(data_layout));
+  // The convolution over the upsampled tensor uses stride 1 and no padding
+  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+  // The two internal stages must accept the configuration too
+  ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+                                                           conv_info, weights_info));
+
+  return Status{};
+}
+
+// Overload that configures the function with the compile context obtained from
+// CLKernelLibrary::get().
+void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights,
+                                           const ICLTensor *bias, ICLTensor *output,
+                                           const PadStrideInfo &info, unsigned int invalid_right,
+                                           unsigned int invalid_bottom,
+                                           const WeightsInfo &weights_info)
+{
+  configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info,
+            invalid_right, invalid_bottom, weights_info);
+}
+
+// Sets up the internal stages: the weights flip (_flip_weights), the input upsampling
+// (_scale_f) and a stride-1 convolution (_conv_f) over the upsampled tensor.
+//
+// input, weights and output must be non-null; bias may be nullptr. The output info is
+// auto-initialized from the input info and the computed output shape if it is still empty,
+// and the resulting configuration is checked with validate().
+void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context,
+                                           ICLTensor *input, ICLTensor *weights,
+                                           const ICLTensor *bias, ICLTensor *output,
+                                           const PadStrideInfo &info, unsigned int invalid_right,
+                                           unsigned int invalid_bottom,
+                                           const WeightsInfo &weights_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const unsigned int stride_x = info.stride().first;
+  const unsigned int stride_y = info.stride().second;
+
+  const DataLayout data_layout = input->info()->data_layout();
+
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+  // Keep the caller's weights; prepare() flips them into _weights_flipped along _flip_axis
+  _original_weights = weights;
+  _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+  _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+  _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+
+  auto out_dims = transposeconv_output_dimensions(
+      input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+      weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+      invalid_bottom);
+
+  const TensorShape output_shape =
+      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+  // Output auto initialization if not yet initialized
+  auto_init_if_empty(
+      *output->info(),
+      input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+  // Perform validation step, using the same weights_info that _conv_f is configured with
+  ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
+      input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+      info, invalid_right, invalid_bottom, weights_info));
+
+  // If the internal weights are retained, prepare() has nothing left to do
+  _is_prepared = weights_info.retain_internal_weights();
+
+  _memory_group.manage(&_scaled_output);
+
+  // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+  // to match output shape
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+      pad_right, pad_top, pad_bottom);
+
+  TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+                            input->info()->quantization_info());
+  scale_out_info.set_data_layout(data_layout);
+  _scaled_output.allocator()->init(scale_out_info);
+
+  // configure scale function (NOTE(review): compile_context is not forwarded here - confirm)
+  const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+                                    DimensionRoundingType::FLOOR);
+  _scale_f.configure(input, &_scaled_output, upsample_info);
+
+  // Setup the function to convolve the upscaled output
+  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+  _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info,
+                    weights_info);
+  _scaled_output.allocator()->allocate();
+
+  // Setup flip axis data: axes {1, 2} for NHWC weights, {0, 1} otherwise
+  _flip_axis.allocator()->allocate();
+  _flip_axis.map(true);
+  auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+  if (weights->info()->data_layout() == DataLayout::NHWC)
+  {
+    axis_data[0] = 1;
+    axis_data[1] = 2;
+  }
+  else
+  {
+    axis_data[0] = 0;
+    axis_data[1] = 1;
+  }
+  _flip_axis.unmap();
+}
+
+// Runs the function: makes sure the weights are prepared, then upsamples and convolves.
+void CLDirectTransposeConvLayer::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  _scale_f.run();
+  _conv_f.run();
+}
+
+// One-time preparation: flips the weights, prepares the convolution and frees the flipped
+// weights if they are no longer marked as used. Does nothing once _is_prepared is set.
+void CLDirectTransposeConvLayer::prepare()
+{
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Run weights flipping and mark original weights tensor as unused
+    _weights_flipped.allocator()->allocate();
+    _flip_weights.run();
+    _original_weights->mark_as_unused();
+
+    // Prepare convolution
+    _conv_f.prepare();
+
+    // Free flipped weights
+    if (!_weights_flipped.is_used())
+    {
+      _weights_flipped.allocator()->free();
+    }
+
+    _is_prepared = true;
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
index 3d9a28a48..ae9d8afc6 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
 void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
                                   const ICLTensor *lookups)
 {
-  auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+  auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
   k->configure(input, output, lookups);
   _kernel = std::move(k);
 }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
index f098832b0..01989461e 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -45,7 +45,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
 
 #include <algorithm>
 
@@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
   ARM_COMPUTE_UNUSED(weights);
   ARM_COMPUTE_UNUSED(output);
   ARM_COMPUTE_RETURN_ON_ERROR(
-      CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+      CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
 
   return Status{};
 }
@@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
 
 void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
 {
-  auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+  auto k = support::cpp14::make_unique<CLTransposeKernel>();
   k->configure(input, output);
   _kernel = std::move(k);
 }
@@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen
 
   // Quantize input
   _quantized_input.allocator()->init(
-      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+          DataType::QASYMM8_SIGNED));
   _memory_group.manage(&_quantized_input);
   _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
 
@@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
 {
   ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
   ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
   ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
 
@@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
   ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
 
   // Validate quantization symm8 kernel
-  const ITensorInfo &quantized_input = TensorInfo(
-      input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+  const ITensorInfo &quantized_input =
+      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+          DataType::QASYMM8_SIGNED));
   ARM_COMPUTE_RETURN_ON_ERROR(
       CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
 
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
index 63e291b36..2ff4b9659 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -46,7 +46,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
 
 #include <algorithm>
 
@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
 
 void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
 {
-  auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+  auto k = support::cpp14::make_unique<CLTransposeKernel>();
   k->configure(input, output);
   _kernel = std::move(k);
 }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
index 9aebc473e..157b4d977 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
       fc->configure(input_to_use, _weights, _biases, _output);
       return std::unique_ptr<arm_compute::IFunction>(fc);
     }
-    else
+    else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS)
     {
-      assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
-
       bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
                         input->info()->data_type() == DataType::F16) &&
-                       weights->info()->data_type() == DataType::S8;
+                       (weights->info()->data_type() == DataType::S8 ||
+                        weights->info()->data_type() == DataType::QASYMM8_SIGNED);
 
       if (is_hybrid)
       {
         auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
+        ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+        const auto orgin_weights_data_type = weights_info->data_type();
+        weights_info->set_data_type(DataType::QASYMM8_SIGNED);
         fc->configure(input_to_use, _weights, _biases, _output);
+        weights_info->set_data_type(orgin_weights_data_type);
         return std::unique_ptr<arm_compute::IFunction>(fc);
       }
       else
@@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
         return std::unique_ptr<arm_compute::IFunction>(fc);
       }
     }
+    else
+    {
+      throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
+    }
+
   }();
 
   if (_needs_reshape)
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
deleted file mode 100644
index ca5499dfc..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::cl_gemm;
-
-namespace
-{
-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
-{
-  return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx(
-    std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(),
-      _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0),
-      _reshape_b_only_on_first_run(false), _is_prepared(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b,
-                                               const ICLTensor *c, ICLTensor *output,
-                                               const GEMMInfo &gemm_info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-  ARM_COMPUTE_UNUSED(c);
-  ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate(
-      a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
-  _is_prepared = false;
-  _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-  _a_offset = a->info()->quantization_info().uniform().offset;
-  _b_offset = b->info()->quantization_info().uniform().offset;
-
-  // Get the GPU target
-  const GPUTarget gpu_target = CLScheduler::get().target();
-
-  // Set the target for the kernels
-  _mm_midgard_kernel.set_target(gpu_target);
-
-  // GEMMRHSMatrixInfo rhs_info;
-  // GEMMLHSMatrixInfo lhs_info;
-
-  // Arguments used by GEMMReshapeInfo
-  // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m,
-  // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
-  // in order to know how the matrices have been reshaped
-  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-  const unsigned int m = reinterpret_input_as_3d
-                             ? (a->info()->dimension(1) * a->info()->dimension(2))
-                             : a->info()->dimension(1);
-  const unsigned int n = b->info()->dimension(0);
-  const unsigned int k = a->info()->dimension(0);
-  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
-  const ICLTensor *matrix_b = b;
-  // Configure matrix multiply kernel
-  _mm_midgard_kernel.configure(
-      a, matrix_b, output,
-      GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
-}
-
-Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
-                                                const ITensorInfo *c, const ITensorInfo *output,
-                                                const GEMMInfo &gemm_info)
-{
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-  ARM_COMPUTE_UNUSED(c);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
-                                  "Matrix A already reshaped is not supported");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
-                                  "Matrix B already reshaped is not supported");
-
-  const ITensorInfo *matrix_a_info = a;
-
-  // Get the GPU target
-  const GPUTarget gpu_target = CLScheduler::get().target();
-
-  bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-  const unsigned int m =
-      reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-  const unsigned int n = b->dimension(0);
-  const unsigned int k = a->dimension(0);
-  const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
-  bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target);
-
-  const GEMMReshapeInfo reshape_info =
-      GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
-  TensorInfo weights_info(*b);
-  const ITensorInfo *matrix_b_info = &weights_info;
-  if (reshape_matrix_b)
-  {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(false,
-                                    "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b");
-  }
-
-  // Validate matrix multiply
-  ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate(
-      matrix_a_info, matrix_b_info, output, reshape_info));
-
-  return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::run()
-{
-  prepare();
-
-  MemoryGroupResourceScope scope_mg(_memory_group);
-
-  // Run matrix multiply
-  CLScheduler::get().enqueue(_mm_midgard_kernel, false);
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::prepare()
-{
-  if (!_is_prepared)
-  {
-    _is_prepared = true;
-  }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
index f594d7a2e..e0b833b04 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -48,7 +48,7 @@ using namespace arm_compute;
 void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
                            int axis)
 {
-  auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>();
+  auto k = support::cpp14::make_unique<CLGatherExKernel>();
   k->configure(input, indices, output, axis);
   _kernel = std::move(k);
 }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
index 27ed8e828..65b89a389 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
 void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
                                   const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
 {
-  auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+  auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
   k->configure(lookups, keys, input, output, hits);
   _kernel = std::move(k);
 }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
index 80393e8d1..5a7e40839 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
 void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
                                                ICLTensor *gamma, ICLTensor *beta, float epsilon)
 {
-  auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+  auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
   k->configure(input, output, gamma, beta, epsilon);
   _kernel = std::move(k);
 }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
deleted file mode 100644
index fbb15ab1d..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLPReLU.h"
-
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
-{
-  auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
-  k->configure(input, alpha, output);
-  _kernel = std::move(k);
-
-  if (output->info()->dimension(0) > 1)
-  {
-    ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
-
-    if (broadcasted_info->info()->dimension(0) == 1)
-    {
-      _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-    }
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
deleted file mode 100644
index 6049b7e70..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
-      _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
-      _gemm_output(), _add_output(), _is_prepared(false)
-{
-}
-
-Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
-                              const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
-                              const ITensorInfo *hidden_state, const ITensorInfo *output,
-                              const ActivationLayerInfo &info)
-{
-  const int idx_width = 0;
-  const int idx_height = 1;
-  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
-                                      output);
-  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
-                              recurrent_weights->dimension(idx_width));
-  ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
-                              recurrent_weights->dimension(1));
-  ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
-  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
-  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
-  ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
-                                                     hidden_state->tensor_shape());
-
-  auto shape_info =
-      TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
-                 input->data_type());
-
-  ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-  ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
-      ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
-  ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
-
-  return Status{};
-}
-
-void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
-                             const ICLTensor *recurrent_weights, const ICLTensor *bias,
-                             ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
-  ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
-                                                    recurrent_weights->info(), bias->info(),
-                                                    hidden_state->info(), output->info(), info));
-
-  const int idx_height = 1;
-  TensorShape shape =
-      compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
-
-  _is_prepared = false;
-
-  _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
-  _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
-
-  // Manage intermediate buffers and configure
-  _memory_group.manage(&_fully_connected_out);
-  _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
-
-  _memory_group.manage(&_gemm_output);
-  _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
-
-  _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
-  _memory_group.manage(&_add_output);
-
-  _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
-                        &_add_output, ConvertPolicy::SATURATE);
-
-  _fully_connected_out.allocator()->allocate();
-  _gemm_output.allocator()->allocate();
-
-  _activation_kernel.configure(&_add_output, hidden_state, info);
-  _add_output.allocator()->allocate();
-
-  _copy_kernel.configure(hidden_state, output);
-}
-
-void CLRNNLayerEx::run()
-{
-  prepare();
-
-  _memory_group.acquire();
-
-  _fully_connected_kernel.run();
-  _gemm_state_f.run();
-  CLScheduler::get().enqueue(_add_kernel);
-  CLScheduler::get().enqueue(_activation_kernel);
-
-  // copy hidden out to output
-  CLScheduler::get().enqueue(_copy_kernel);
-
-  _memory_group.release();
-}
-
-void CLRNNLayerEx::prepare()
-{
-  if (!_is_prepared)
-  {
-    _fully_connected_kernel.prepare();
-    _gemm_state_f.prepare();
-
-    _is_prepared = true;
-  }
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index 8ce2d746c..a41e6db60 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
   const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
 
   // Create temporary tensor infos
-  auto interm_tensors =
-      arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+  auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
 
   // Create intermediate tensor info
   TensorShape shape{input->tensor_shape()};
@@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
   const size_t num_of_kernels = axis.size();
   const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
 
-  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
-  _reduce_kernels =
-      arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+  _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
 
   // Set a vector that is ordered ICLTensors sequentially.
   std::vector<ICLTensor *> tensors;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
deleted file mode 100644
index 7d7b2264b..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
-
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-using namespace arm_compute;
-
-void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
-{
-  auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
-  k->configure(input, output, block_size);
-  _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
index e61746ef2..3215d01a7 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -15,7 +15,7 @@
  */
 
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,218 +37,124 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
 
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/UtilsEx.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
 
+#include <cmath>
 #include <memory>
 #include <tuple>
 
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
-CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _scale_f(),
-      _conv_f(),
-      _flip_weights(),
-      _scaled_output(),
-      _original_weights(nullptr),
-      _weights_flipped(),
-      _is_prepared(false)
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_manager(std::move(memory_manager)), _function()
+{
+}
+
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+                                     ICLTensor *output, const PadStrideInfo &deconv_info,
+                                     unsigned int invalid_right, unsigned int invalid_bottom,
+                                     const WeightsInfo &weights_info)
 {
+  configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info,
+            invalid_right, invalid_bottom, weights_info);
+}
+
+void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
+                                     ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+                                     const PadStrideInfo &deconv_info, unsigned int invalid_right,
+                                     unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr,
+                                                         output->info(), deconv_info, invalid_right,
+                                                         invalid_bottom, weights_info))
+  {
+    case DeconvolutionMethod::DIRECT:
+    {
+      auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
+      f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
+                   invalid_bottom, weights_info);
+      _function = std::move(f);
+      break;
+    }
+    case DeconvolutionMethod::GEMM:
+    {
+      auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+      f->configure(compile_context, input, weights, bias, output, deconv_info);
+      _function = std::move(f);
+      break;
+    }
+    default:
+      ARM_COMPUTE_ERROR("Not supported.");
+      break;
+  }
 }
 
 Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
                                       const ITensorInfo *bias, ITensorInfo *output,
-                                      const PadStrideInfo &info, unsigned int invalid_right,
+                                      const PadStrideInfo &deconv_info, unsigned int invalid_right,
                                       unsigned int invalid_bottom, const WeightsInfo &weights_info)
 {
   ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
-                                                       DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-
-  const DataLayout data_layout = input->data_layout();
-
-  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-  const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
-  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-
-  const unsigned int kernel_x = weights->dimension(idx_w);
-  const unsigned int kernel_y = weights->dimension(idx_h);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
-                                  "invalid_right must be smaller than kernel_x");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
-                                  "inner_border_top must be smaller than kernel_y");
-
-  // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added.
-  auto out_dims = transposeconv_output_dimensions(
-      input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
-      weights->dimension(idx_h), info, invalid_right, invalid_bottom);
-
-  const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
-  if (bias != nullptr)
+  switch (CLTransposeConvLayer::get_deconvolution_method(
+      input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
   {
-    if (is_data_type_quantized_asymmetric(input->data_type()))
+    case DeconvolutionMethod::DIRECT:
     {
-      ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+      // Validate direct transpose convolution layer
+      ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate(
+          input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
+      break;
     }
-    else
+    case DeconvolutionMethod::GEMM:
     {
-      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+      // Validate GEMM-based deconvolution layer
+      ARM_COMPUTE_RETURN_ON_ERROR(
+          CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+      break;
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+    default:
+      ARM_COMPUTE_ERROR("Not supported.");
+      break;
   }
 
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
-                                  "Output's width is invalid.");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
-                                  "Output's height is invalid.");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
-                                  "Output's depth is invalid.");
-
-  unsigned int pad_left = 0;
-  unsigned int pad_right = 0;
-  unsigned int pad_top = 0;
-  unsigned int pad_bottom = 0;
-  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
-      pad_bottom);
-  TensorInfo scale_out_info(input->clone()
-                                ->set_is_resizable(true)
-                                .reset_padding()
-                                .set_tensor_shape(scale_out_shape)
-                                .set_data_layout(data_layout));
-  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
-  ARM_COMPUTE_RETURN_ON_ERROR(
-      CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
-  ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
-                                                           conv_info, weights_info));
-
   return Status{};
 }
 
-void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
-                                     ICLTensor *output, const PadStrideInfo &info,
-                                     unsigned int invalid_right, unsigned int invalid_bottom,
-                                     const WeightsInfo &weights_info)
+DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method(
+    const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
+    ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
+    unsigned int invalid_bottom, const WeightsInfo &weights_info)
 {
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
-  const unsigned int stride_x = info.stride().first;
-  const unsigned int stride_y = info.stride().second;
+  ARM_COMPUTE_UNUSED(output, bias, weights_info);
 
-  const DataLayout data_layout = input->info()->data_layout();
+  const DataLayout data_layout = input->data_layout();
 
   const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
   const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
-  _original_weights = weights;
-  _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-  _flip_weights.configure(weights, &_weights_flipped);
-
-  // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were
-  // added.
-  auto out_dims = transposeconv_output_dimensions(
-      input->info()->dimension(idx_w), input->info()->dimension(idx_h),
-      weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
-      invalid_bottom);
-
-  const TensorShape output_shape =
-      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
-
-  // Output auto initialization if not yet initialized
-  auto_init_if_empty(
-      *output->info(),
-      input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
-  // Perform validation step
-  ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
-      input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
-      info, invalid_right, invalid_bottom));
-
-  _is_prepared = weights_info.retain_internal_weights();
-
-  _memory_group.manage(&_scaled_output);
-
-  // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
-  // to match output shape
-  unsigned int pad_left = 0;
-  unsigned int pad_right = 0;
-  unsigned int pad_top = 0;
-  unsigned int pad_bottom = 0;
-  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
-      pad_right, pad_top, pad_bottom);
-
-  TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
-                            input->info()->quantization_info());
-  scale_out_info.set_data_layout(data_layout);
-  _scaled_output.allocator()->init(scale_out_info);
-
-  // configure scale function
-  const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
-                                    DimensionRoundingType::FLOOR);
-  _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
-
-  // setup the function to convolve the upscaled output
-  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-  _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
-  _scaled_output.allocator()->allocate();
+  if (weights->dimension(idx_w) != deconv_info.stride().first ||
+      weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 ||
+      invalid_bottom != 0)
+  {
+    return DeconvolutionMethod::DIRECT;
+  }
+
+  return DeconvolutionMethod::GEMM;
 }
 
 void CLTransposeConvLayer::run()
 {
   prepare();
-
-  _memory_group.acquire();
-
-  _scale_f.run();
-  _conv_f.run();
-
-  _memory_group.release();
+  _function->run();
 }
 
-void CLTransposeConvLayer::prepare()
-{
-  if (!_is_prepared)
-  {
-    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
-    // Run weights flipping and mark original weights tensor as unused
-    _weights_flipped.allocator()->allocate();
-    _weights_flipped.map(true);
-    _original_weights->map(CLScheduler::get().queue(), true);
-    CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
-    _weights_flipped.unmap();
-    _original_weights->unmap(CLScheduler::get().queue());
-    _original_weights->mark_as_unused();
-
-    // Prepare convolution
-    _conv_f.prepare();
-
-    if (!_weights_flipped.is_used())
-    {
-      _weights_flipped.allocator()->free();
-    }
-
-    _is_prepared = true;
-  }
-}
+void CLTransposeConvLayer::prepare() { _function->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
deleted file mode 100644
index 07feb5a64..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
-CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
-    : _upsample(),
-      _output(nullptr)
-{
-}
-
-Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                              const BorderSize &inner_border,
-                                              const PadStrideInfo &info)
-{
-  return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
-}
-
-void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
-                                             const BorderSize &inner_border,
-                                             const PadStrideInfo &info)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-  _output = output;
-  _upsample.configure(input, _output, inner_border, info);
-}
-
-void CLTransposeConvLayerUpsample::run()
-{
-  _output->map(CLScheduler::get().queue(), true);
-  if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
-  {
-    const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset;
-    std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-  }
-  else
-  {
-    memset(_output->buffer(), 0, _output->info()->total_size());
-  }
-  _output->unmap(CLScheduler::get().queue());
-
-  CLScheduler::get().enqueue(_upsample, false);
-}