Diffstat (limited to 'libs/ARMComputeEx/src/runtime/CL/functions')
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp  35
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp  120
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp  46
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp  28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp  39
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp  1
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp  40
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp  28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp  29
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp  28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp  4
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp  29
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp  28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp  50
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp  39
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp  28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp  36
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp  3
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp  121
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp  123
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp  51
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp  29
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp  28
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp  39
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp  307
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp  30
-rw-r--r--  libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp  5
27 files changed, 853 insertions, 491 deletions
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp
new file mode 100644
index 000000000..1e52fc429
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLActivationLayerEx.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLActivationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLActivationLayerExKernel.h"
+
+using namespace arm_compute;
+
+void CLActivationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ ActivationLayerInfoEx act_info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerExKernel>();
+ k->configure(input, output, act_info);
+ _kernel = std::move(k);
+}
+
+Status CLActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ActivationLayerInfoEx &act_info)
+{
+ return CLActivationLayerExKernel::validate(input, output, act_info);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp
new file mode 100644
index 000000000..dff743e89
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArgMinMax.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgMinMax.h"
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgMinMax::CLArgMinMax()
+ : _input(nullptr), _output(nullptr), _argminmax_axis(), _interm_tensors(), _argminmax_kernels(),
+ _num_of_kernels()
+{
+}
+
+void CLArgMinMax::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+ ArgOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op));
+ _input = input;
+ _output = output;
+ _argminmax_axis = axis;
+ _arg_op = op;
+ // NOTE The argminmax_axis must have no duplication.
+ _num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _argminmax_kernels =
+ arm_compute::support::cpp14::make_unique<CLArgMinMaxKernel[]>(_num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(_argminmax_axis[i], 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+ // Set a vector that is ordered ICLTensors sequentially.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ArgMinMax on all kernels
+ for (size_t i = 0; i < _num_of_kernels; i++)
+ {
+ _argminmax_kernels[i].configure(tensors[i], tensors[i + 1], _argminmax_axis[i], op);
+ }
+}
+
+Status CLArgMinMax::validate(const ITensorInfo *input, const std::vector<uint32_t> &argminmax_axis,
+ const ITensorInfo *output, ArgOperation op)
+{
+ const size_t num_of_kernels = argminmax_axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ shape.set(argminmax_axis[i], 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+ // Set a vector that is ordered ITensorInfo sequentially.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; i++)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate argminmax only on all kernels
+ for (size_t i = 0; i < num_of_kernels; i++)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArgMinMaxKernel::validate(tensors[i], tensors[i + 1], argminmax_axis[i], op));
+ }
+
+ return Status{};
+}
+
+void CLArgMinMax::run()
+{
+ for (size_t i = 0; i < _num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_argminmax_kernels[i]);
+ }
+}
+
+} // namespace arm_compute
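
The new CLArgMinMax chains one CLArgMinMaxKernel per requested axis, threading the data through axis.size() - 1 intermediate CLTensors before writing the user-supplied output. A minimal sketch of that tensor/kernel pairing, in plain C++ with hypothetical axes {0, 2} rather than the ARMComputeEx types themselves:

    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main()
    {
      // Hypothetical axes for a 3-D input; mirrors the pairing built in
      // CLArgMinMax::configure() above.
      std::vector<unsigned> axis{0, 2};

      std::vector<std::string> tensors{"input"};
      for (std::size_t i = 0; i + 1 < axis.size(); ++i) // num_of_interm_tensors = axis.size() - 1
        tensors.push_back("interm[" + std::to_string(i) + "]");
      tensors.push_back("output");

      for (std::size_t i = 0; i < axis.size(); ++i) // one CLArgMinMaxKernel per axis
        std::printf("kernel %zu: %s -> %s (reduces axis %u)\n", i, tensors[i].c_str(),
                    tensors[i + 1].c_str(), axis[i]);
      return 0;
    }

Run, this prints kernel 0: input -> interm[0] and kernel 1: interm[0] -> output, which is exactly the order CLArgMinMax::run() enqueues the kernels in.
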
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp
new file mode 100644
index 000000000..3f403c80a
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLArithmeticSubtractionEx.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtractionEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionExKernel.h"
+
+using namespace arm_compute;
+
+void CLArithmeticSubtractionEx::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionExKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
+
+Status CLArithmeticSubtractionEx::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, ConvertPolicy policy)
+{
+ return CLArithmeticSubtractionExKernel::validate(input1, input2, output, policy);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp
new file mode 100644
index 000000000..26e3798cc
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBatchToSpaceND.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBatchToSpaceND.h"
+
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceNDKernel.h"
+
+using namespace arm_compute;
+
+void CLBatchToSpaceND::configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBatchToSpaceNDKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h"
+
+#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
index e1059ab53..8e106737c 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
@@ -17,7 +17,6 @@
#include "arm_compute/runtime/CL/functions/CLCast.h"
#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-#include "support/ToolchainSupport.h"
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp
new file mode 100644
index 000000000..f6a745a25
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLComparisonOp.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLComparisonOp.h"
+
+#include "arm_compute/core/CL/kernels/CLComparisonOpKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLComparisonOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+ const ComparisonOperation &op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLComparisonOpKernel>();
+ k->configure(input1, input2, output, op);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
new file mode 100644
index 000000000..c2e4ca9ff
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
+
+using namespace arm_compute;
+
+void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..2781784ca
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+ const ICLTensor *lookups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp
new file mode 100644
index 000000000..411fa8700
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLExp.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLExp.h"
+
+#include "arm_compute/core/CL/kernels/CLExpKernel.h"
+
+using namespace arm_compute;
+
+void CLExp::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLExpKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
index 5552cbc6f..fb056fe45 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp
@@ -16,11 +16,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLGather.h"
-#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..7180e9356
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+ const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..be35ea732
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp
new file mode 100644
index 000000000..276c4557a
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLNormalizationLayerEx.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayerEx.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayerEx::CLNormalizationLayerEx() : _norm_kernel(), _border_handler() {}
+
+void CLNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+ // Configure normalization kernel
+ _norm_kernel.configure(input, output, norm_info);
+
+ // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
+ _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+Status CLNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
+{
+ return CLNormalizationLayerExKernel::validate(input, output, norm_info);
+}
+
+void CLNormalizationLayerEx::run()
+{
+ // Run border handler
+ CLScheduler::get().enqueue(_border_handler, false);
+
+ // Run normalization kernel
+ CLScheduler::get().enqueue(_norm_kernel);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
new file mode 100644
index 000000000..38adedd10
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPReLU.h"
+
+#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
new file mode 100644
index 000000000..5265b6c34
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
@@ -0,0 +1,28 @@
+/*
+* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+* Copyright (c) 2016-2018 ARM Limited.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+
+using namespace arm_compute;
+
+void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *pad_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPadLayerKernel>();
+ k->configure(input, output, pad_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp
new file mode 100644
index 000000000..fb363270d
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPermuteEx.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLPermuteEx.h"
+
+#include "arm_compute/core/CL/kernels/CLPermuteExKernel.h"
+
+using namespace arm_compute;
+
+void CLPermuteEx::configure(const ICLTensor *input, ICLTensor *output,
+ const PermutationVector &perm)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPermuteExKernel>();
+ k->configure(input, output, perm);
+ _kernel = std::move(k);
+}
+
+Status CLPermuteEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteExKernel::validate(input, output, perm));
+ return Status{};
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
index e1add5e90..dc0baa8dd 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp
@@ -18,9 +18,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
using namespace arm_compute;
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
deleted file mode 100644
index 3382058db..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLReduceMax.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "support/ToolchainSupport.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h"
-
-#include <vector>
-#include <algorithm>
-
-#include <utility>
-
-#define REDUCE_MAX_RUN_ON_CPU 1
-
-namespace arm_compute
-{
-
-CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {}
-
-void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output)
-{
- _axis = axis;
-
- _input = input;
- _output = output;
-
- auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>();
- k->configure(input, axis, output);
- _kernel = std::move(k);
-
- // We can handle for simple case only
- // Output rank: 1
- // Axis: one axis value, restrict to 1
- ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2);
- ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1);
- ARM_COMPUTE_ERROR_ON(axis != 1);
-}
-
-Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output)
-{
- return CLReduceMaxKernel::validate(input, axis, output);
-}
-
-void CLReduceMax::run()
-{
-#if REDUCE_MAX_RUN_ON_CPU
- run_on_cpu();
-
- arm_compute::CLScheduler::get().sync();
-#else
- arm_compute::CLScheduler::get().enqueue(*_kernel);
-#endif
-}
-
-void CLReduceMax::run_on_cpu()
-{
- cl::CommandQueue q = CLScheduler::get().queue();
-
- _input->map(q);
- _output->map(q);
-
- // Compute by CPU for simple case
- // Input rank: 2
- // Output rank: 1
- // Axis: one axis value, restrict to 1
-
- float *input_data = (float *)_input->buffer();
- float *output_data = (float *)_output->buffer();
-
- std::vector<float> container_max;
- int cols = _input->info()->tensor_shape()[0];
- int rows = _input->info()->tensor_shape()[1];
- container_max.resize(rows);
-
- // Initialize as 1st element in row
- float *input_pointer = input_data;
- for (int i = 0; i < rows; i++)
- {
- container_max[i] = *input_pointer;
- input_pointer += cols;
- }
-
- // Update max value in row
- for (int i = 0; i < rows; i++)
- {
- float max_in_row = container_max[i];
- for (int j = 1; j < cols; j++)
- {
- if (max_in_row < input_data[i * cols + j])
- {
- max_in_row = input_data[i * cols + j];
- }
- }
- container_max[i] = max_in_row;
- }
-
- for (int i = 0; i < rows; i++)
- {
- output_data[i] = container_max[i];
- }
-
- _input->unmap(q);
- _output->unmap(q);
-}
-} // namespace arm_compute
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..2b8d82706
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLReduceOperation::CLReduceOperation()
+ : _input(nullptr), _output(nullptr), _axis(), _interm_tensors(), _reduce_kernels()
+{
+}
+
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, const ReduceOperation &op)
+{
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ }
+
+ // Set a vector that is ordered ITensorInfo sequentially.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate ReduceOperation only on all kernels
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ }
+
+ return Status{};
+}
+
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const std::set<uint32_t> &axis, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, op));
+
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+
+ // NOTE The axis must have no duplication.
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels =
+ arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+ TensorShape shape{input->info()->tensor_shape()};
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1);
+ _interm_tensors[i].allocator()->init(
+ TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _interm_tensors[i].allocator()->allocate();
+ }
+
+ // Set a vector that is ordered ICLTensors sequentially.
+ std::vector<ICLTensor *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(_interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Apply ReduceOperation on all kernels
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+ }
+}
+
+void CLReduceOperation::run()
+{
+ const size_t num_of_kernels = _axis.size();
+ for (size_t i = 0; i < num_of_kernels; ++i)
+ {
+ CLScheduler::get().enqueue(_reduce_kernels[i]);
+ }
+}
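
Like CLArgMinMax, CLReduceOperation reduces one axis per kernel; because the axes arrive in a std::set<uint32_t>, they are visited in ascending order and each stage sets that dimension to 1 in the next tensor's shape. A short sketch of the shape progression, using plain std::vector shapes (hypothetical input shape, not the ACL TensorShape type):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <set>
    #include <vector>

    int main()
    {
      std::vector<std::size_t> shape{4, 3, 2, 5}; // hypothetical input shape
      const std::set<uint32_t> axis{1, 3};        // reduce over dimensions 1 and 3

      // Each CLReduceOperationKernel collapses one axis; the earlier stages write
      // the intermediate tensors, the last one writes the user-supplied output.
      for (uint32_t a : axis)
      {
        shape[a] = 1;
        std::printf("after reducing axis %u:", a);
        for (std::size_t d : shape)
          std::printf(" %zu", d);
        std::printf("\n"); // {4,1,2,5}, then {4,1,2,1}
      }
      return 0;
    }
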
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
deleted file mode 100644
index ab724e752..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLReductionMean.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {}
-
-Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output,
- std::vector<uint32_t> axis)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis));
- return Status{};
-}
-
-void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis)
-{
- _reduction_mean_kernel.configure(input, output, axis);
- _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT,
- PixelValue(0));
-}
-
-void CLReductionMean::run()
-{
- CLScheduler::get().enqueue(_fill_border_kernel);
- CLScheduler::get().enqueue(_reduction_mean_kernel);
-}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644
index 000000000..c03826891
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size,
+ const ICLTensor *padding_size, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>();
+ k->configure(input, block_size, padding_size, output);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
new file mode 100644
index 000000000..0f455f96f
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp
new file mode 100644
index 000000000..dc6e4af44
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSquaredDifference.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSquaredDifference.h"
+
+#include "arm_compute/core/CL/kernels/CLSquaredDifferenceKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+void CLSquaredDifference::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSquaredDifferenceKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+
+ if (output->info()->dimension(0) > 1)
+ {
+ ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+ if (broadcasted_info->info()->dimension(0) == 1)
+ {
+ _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ }
+ }
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
deleted file mode 100644
index cd576cec1..000000000
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-#include <vector>
-
-using namespace arm_compute;
-
-static const int32_t maxDims = 4;
-
-// Return the index for the first element along that axis. This index will be a
-// positive integer between [0, axisSize - 1] that can be used to index
-// directly into the data.
-inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices,
- std::vector<int32_t> const &strides, const TensorShape &inputShape,
- int32_t axis)
-{
- // Begin with the specified index
- int32_t start = startIndices[axis];
-
- // beginMask override
- if (beginMask & 1 << axis)
- {
- if (strides[axis] > 0)
- {
- // Forward iteration - use the first element. These values will get
- // clamped below (Note: We could have set them to 0 and axisSize-1, but
- // use lowest() and max() to maintain symmetry with StopForAxis())
- start = std::numeric_limits<int32_t>::lowest();
- }
- else
- {
- // Backward iteration - use the last element.
- start = std::numeric_limits<int32_t>::max();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (start < 0)
- {
- start += axisSize;
- }
-
- // Clamping
- start = arm_compute::utility::clamp(start, 0, axisSize - 1);
-
- return start;
-}
-
-// Return the "real" index for the end of iteration along that axis. This is an
-// "end" in the traditional C sense, in that it points to one past the last
-// element. ie. So if you were iterating through all elements of a 1D array of
-// size 4, this function would return 4 as the stop, because it is one past the
-// "real" indices of 0, 1, 2 & 3.
-inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices,
- std::vector<int32_t> const &strides, const TensorShape &inputShape,
- int32_t axis)
-{
- // Begin with the specified index
- int32_t stop = stopIndices[axis];
-
- // endMask override
- if (endMask & (1 << axis))
- {
- if (strides[axis] > 0)
- {
- // Forward iteration - use the last element. These values will get
- // clamped below
- stop = std::numeric_limits<int32_t>::max();
- }
- else
- {
- // Backward iteration - use the first element.
- stop = std::numeric_limits<int32_t>::lowest();
- }
- }
-
- // Handle negative indices
- int32_t axisSize = inputShape[axis];
- if (stop < 0)
- {
- stop += axisSize;
- }
-
- // Clamping
- // Because the end index points one past the last element, we need slightly
- // different clamping ranges depending on the direction.
- if (strides[axis] > 0)
- {
- // Forward iteration
- stop = arm_compute::utility::clamp(stop, 0, axisSize);
- }
- else
- {
- // Backward iteration
- stop = arm_compute::utility::clamp(stop, -1, axisSize - 1);
- }
-
- return stop;
-}
-
-inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w)
-{
- int32_t offset = b * shape[2] * shape[1] * shape[0];
- offset += d * shape[1] * shape[0];
- offset += h * shape[0];
- offset += w;
- return offset;
-}
-
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
- k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
- _kernel = std::move(k);
-}
-
-void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
- int32_t endMask, int32_t shrinkAxisMask)
-{
- ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate(
- input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(),
- beginMask, endMask, shrinkAxisMask));
-
- _input = input;
- _output = output;
- _beginData = beginData;
- _endData = endData;
- _stridesData = stridesData;
- _beginMask = beginMask;
- _endMask = endMask;
- _shrinkAxisMask = shrinkAxisMask;
-}
-
-void CLStridedSliceCPU::run()
-{
- run_on_cpu();
-
- arm_compute::CLScheduler::get().sync();
-}
-
-inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride)
-{
- if (stride > 0)
- {
- return ((stop - start - 1) / stride) + 1;
- }
- else
- {
- return ((stop - start + 1) / stride) + 1;
- }
-}
-
-template <typename T>
-inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask,
- int32_t endMask, const std::vector<int32_t> &startIndices,
- const std::vector<int32_t> &stopIndices,
- const std::vector<int32_t> &strides, T *outputData)
-{
- ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims);
- ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims);
- ARM_COMPUTE_ERROR_ON(strides.size() != maxDims);
-
- const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3);
- const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3);
- const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2);
- const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2);
- const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1);
- const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1);
- const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0);
- const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0);
-
- // The shape of outputData may collapse in one-dimension.
- // Therefore, it is necessary to create a shape that matches the result of the outputData.
- TensorShape outputShape(
- getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]),
- getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3]));
- for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b;
- in_b += strides[3], b++)
- {
- for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d;
- in_d += strides[2], d++)
- {
- for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h;
- in_h += strides[1], h++)
- {
- for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? in_w < stop_w : in_w > stop_w;
- in_w += strides[0], w++)
- {
- outputData[offset4D(outputShape, b, d, h, w)] =
- inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)];
- }
- }
- }
- }
-}
-
-void CLStridedSliceCPU::run_on_cpu()
-{
- // TODO: Support shrinkAxisMask
- cl::CommandQueue q = CLScheduler::get().queue();
-
- _input->map(q);
- _output->map(q);
- _beginData->map(q);
- _endData->map(q);
- _stridesData->map(q);
-
- TensorShape inputShape = _input->info()->tensor_shape();
- TensorShape outputShape = _output->info()->tensor_shape();
-
- std::vector<int32_t> starts;
- std::vector<int32_t> stops;
- std::vector<int32_t> strides;
-
- for (uint32_t idx = 0; idx <= _input->info()->num_dimensions() - 1; ++idx)
- {
- starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]);
- stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]);
- strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]);
- }
-
- for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++)
- {
- starts.emplace_back(0);
- stops.emplace_back(1);
- strides.emplace_back(1);
- }
-
- switch (_input->info()->data_type())
- {
- case DataType::U8:
- case DataType::QASYMM8:
- StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint8_t *>(_output->buffer()));
- break;
- case DataType::S8:
- case DataType::QS8:
- StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer()));
- break;
- case DataType::U16:
- StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint16_t *>(_output->buffer()));
- break;
- case DataType::S16:
- case DataType::QS16:
- StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<int16_t *>(_output->buffer()));
- break;
- case DataType::F16:
- // Not sure this works.
- StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer()));
- break;
- case DataType::U32:
- StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<uint32_t *>(_output->buffer()));
- break;
- case DataType::S32:
- StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides,
- reinterpret_cast<int32_t *>(_output->buffer()));
- break;
- case DataType::F32:
- StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask,
- _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer()));
- break;
- default:
- ARM_COMPUTE_ERROR("DataType not supported");
- break;
- }
-
- _input->unmap(q);
- _output->unmap(q);
- _beginData->unmap(q);
- _endData->unmap(q);
- _stridesData->unmap(q);
-}
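
The removed CLStridedSliceCPU helpers resolve each axis in three steps: apply the begin/end mask, wrap negative indices, then clamp (the stop index clamps to one past the last element for forward strides). A worked single-axis example under assumed inputs (axis size 10, begin = -4, end bit set in endMask, stride = 2), written as a standalone sketch rather than the removed functions themselves:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int main()
    {
      const int32_t axisSize = 10, stride = 2;

      int32_t start = -4;                                  // negative index counts from the end
      int32_t stop = std::numeric_limits<int32_t>::max();  // endMask bit set and stride > 0

      if (start < 0)
        start += axisSize;                                 // -4 -> 6
      start = std::min(std::max(start, 0), axisSize - 1);  // clamp to [0, 9] -> 6
      stop = std::min(std::max(stop, 0), axisSize);        // forward clamp to [0, 10] -> 10

      const int32_t outDim = ((stop - start - 1) / stride) + 1; // picks elements 6 and 8 -> 2
      std::printf("start=%d stop=%d outDim=%d\n", start, stop, outDim);
      return 0;
    }
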
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp
new file mode 100644
index 000000000..be7353493
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSliceEx.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h"
+
+#include "arm_compute/core/CL/kernels/CLStridedSliceExKernel.h"
+
+using namespace arm_compute;
+
+void CLStridedSliceEx::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+ ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask,
+ int32_t endMask, int32_t shrinkAxisMask)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceExKernel>();
+ k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask);
+ _kernel = std::move(k);
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
index 6426364c9..19177497c 100644
--- a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp
@@ -15,12 +15,9 @@
* limitations under the License.
*/
#include "arm_compute/runtime/CL/functions/CLTopKV2.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-
-#include <vector>
-#include <algorithm>
#include "../../topk_v2.h"