diff options
Diffstat (limited to 'runtimes/pure_arm_compute/src/internal/layers')
37 files changed, 2194 insertions, 403 deletions
diff --git a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h index 502a1ee0e..83ae7c17b 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h @@ -14,6 +14,12 @@ * limitations under the License. */ +/** + * @file FeatureLoggingLayer.h + * @brief This file contains FeatureLoggingLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __FEATURE_LOGGING_LAYER_H__ #define __FEATURE_LOGGING_LAYER_H__ @@ -27,9 +33,24 @@ #include "internal/arm_compute.h" +/** + * @brief Class to run FeatureLogging Layer + */ class FeatureLoggingLayer : public ::arm_compute::IFunction { public: + FeatureLoggingLayer(void) : _tag(""), _target(nullptr) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] tag Text tag for this layer + * @param[in] target The feature tensor to be printed + * @return N/A + */ void configure(const std::string &tag, ::arm_compute::ITensor *target) { _tag = tag; @@ -37,6 +58,10 @@ public: } public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ void run(void) override { if (::internal::arm_compute::isGpuMode()) diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc index 311284efc..28789a801 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc @@ -17,8 +17,6 @@ #include "GenericFullyConnectedLayer.h" #include "internal/arm_compute.h" -#include <arm_compute/core/Helpers.h> - void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, ::arm_compute::ITensor *biases, @@ -56,9 +54,9 @@ void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input, { // reshape auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); - _generic_reshape.configure(CAST_NE(_input), &_neon_buffer); + _generic_reshape.configure(_input, &_neon_buffer); - _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output)); + _neon_fc.configure(&_neon_buffer, _weights, _biases, _output); // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate // here. @@ -66,7 +64,7 @@ void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input, } else { - _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output)); + _neon_fc.configure(_input, _weights, _biases, _output); } } } diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h index 55d8683da..f1519f54d 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h @@ -14,23 +14,52 @@ * limitations under the License. 
*/ +/** + * @file GenericFullyConnectedLayer.h + * @brief This file contains GenericFullyConnectedLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__ #define __GENERIC_FULLY_CONNECTED_LAYER_H__ -#include <arm_compute/runtime/Tensor.h> -#include <arm_compute/runtime/CL/CLTensor.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> #include "internal/layers/GenericReshapeLayer.h" +/** + * @brief Class to run FullyConnected Layer with both CPU and GPU + */ class GenericFullyConnectedLayer : public ::arm_compute::IFunction { public: + GenericFullyConnectedLayer(void) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _neon_buffer{}, _cl_fc{}, _neon_fc{}, _generic_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @return N/A + */ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape, ::arm_compute::TensorShape reshape); public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ void run(void) override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc index 2cdfe1b6e..c38c2e9e3 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc @@ -43,8 +43,8 @@ void GenericReshapeLayer::configure(::arm_compute::ITensor *input, ::arm_compute } else { - _neon_permute.configure(CAST_NE(input), &_neon_permuted, pv); - _neon_reshape.configure(&_neon_permuted, CAST_NE(output)); + _neon_permute.configure(input, &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, output); // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. _neon_permuted.allocator()->allocate(); diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h index 1def21085..a22c14c8b 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +/** + * @file GenericReshapeLayer.h + * @brief This file contains GenericReshapeLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __GENERIC_RESHAPE_LAYER_H__ #define __GENERIC_RESHAPE_LAYER_H__ @@ -25,12 +31,33 @@ #include <arm_compute/runtime/NEON/functions/NEPermute.h> #include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> +/** + * @brief Class to run Reshape Layer with both CPU and GPU + */ class GenericReshapeLayer : public ::arm_compute::IFunction { public: + GenericReshapeLayer(void) + : _input(nullptr), _output(nullptr), _cl_permuted{}, _neon_permuted{}, _cl_permute{}, + _cl_reshape{}, _neon_permute{}, _neon_reshape{} + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] output The destination tensor + * @return N/A + */ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ void run(void) override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc deleted file mode 100644 index 4a5370587..000000000 --- a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc +++ /dev/null @@ -1,78 +0,0 @@ -/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include "PadLayer.h"
-#include <arm_compute/runtime/CL/CLScheduler.h>
-
-void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
- unsigned int border_width)
-{
- _input = input;
- _output = output;
- _border_width = border_width;
- _output_height = _output->info()->dimension(0);
- _output_width = _output->info()->dimension(1);
-
- uint8_t constant_border_value = 0;
- ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);
-
- unsigned int padding_size = _border_width;
- input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});
- _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,
- constant_pixel_value);
-}
-
-void PadLayer::run(void)
-{
- _fillborderkernel.run();
-
- ::arm_compute::Coordinates coordinates =
- ::arm_compute::Coordinates(-_border_width, -_border_width);
- ::arm_compute::TensorShape new_tensor_shape =
- ::arm_compute::TensorShape(_output_height, _output_width);
-
- /* NOTE: The cl kernel fills the data in the borders(not in the tensor).
- Once the tensor is received back at NNAPI, we are adjusting
- the valid region in such a way that the padding becomes part of the tensor itself
- and matches the size of output. */
- _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
-
- /* NOTE: Since cl kernel does not have an argument for output tensor while NNAPI does.
- We need to map the input (tensor that is passed to the cl kernel) back to
- output. */
-
- // TODO: Write a modified CLCopy kernel to do this job.
- populateOutput();
-}
-
-void PadLayer::populateOutput()
-{
- auto &queue = ::arm_compute::CLScheduler::get().queue();
- _input->map(queue);
- _output->map(queue);
-
- auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
- auto const source_data = input_tensor->buffer();
-
- auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
- auto dst_data = output_tensor->buffer();
-
- memmove(dst_data, source_data, _output_height * _output_width * 4);
-
- _input->unmap(queue);
- _output->unmap(queue);
-}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.cc new file mode 100644 index 000000000..6d348e814 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.cc @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleArgMinMax.h" +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleArgMinMax::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + std::vector<uint32_t> axis, ::arm_compute::ArgOperation op) +{ + _input = input; + _output = output; + _axis = axis; + _input_rank = input->info()->num_dimensions(); + _op_type = op; +} + +inline const ::arm_compute::TensorShape +inferOutputShape(const ::arm_compute::TensorShape &input_shape, const std::vector<uint32_t> &axis, + int input_rank) +{ + ::arm_compute::TensorShape out_shape{}; + size_t dim = 1; + for (int i = 0; i < input_rank; ++i) + { + dim = input_shape[i]; + out_shape.set(i, dim); + } + + for (int i = 0; i < axis.size(); ++i) + { + out_shape.set(axis[i], 1); + } + + return out_shape; +} + +template <typename T> +inline T getArgMinMaxEle(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::TensorShape &output_shape, const size_t b, + const size_t d, const size_t h, const 
size_t w, const int axis, + const ::arm_compute::ArgOperation op_type) +{ + // If output[dimention] == 1, will check all values of that dimension because of reducing + // dimension. + // Else will check only one value. + const size_t start_b = output_shape[3] == 1 ? 0 : b; + const size_t start_d = output_shape[2] == 1 ? 0 : d; + const size_t start_h = output_shape[1] == 1 ? 0 : h; + const size_t start_w = output_shape[0] == 1 ? 0 : w; + const size_t stop_b = output_shape[3] == 1 ? input_shape[3] - 1 : b; + const size_t stop_d = output_shape[2] == 1 ? input_shape[2] - 1 : d; + const size_t stop_h = output_shape[1] == 1 ? input_shape[1] - 1 : h; + const size_t stop_w = output_shape[0] == 1 ? input_shape[0] - 1 : w; + + ::arm_compute::Coordinates id{w, h, d, b}; + ::arm_compute::Coordinates min_max_id{w, h, d, b}; + + T value = *reinterpret_cast<T *>(input->ptr_to_element(id)); + T tval = *reinterpret_cast<T *>(input->ptr_to_element(id)); + + for (size_t in_b = start_b; in_b <= stop_b; ++in_b) + { + id.set(3, in_b); + for (size_t in_d = start_d; in_d <= stop_d; ++in_d) + { + id.set(2, in_d); + for (size_t in_h = start_h; in_h <= stop_h; ++in_h) + { + id.set(1, in_h); + for (size_t in_w = start_w; in_w <= stop_w; ++in_w) + { + id.set(0, in_w); + if (op_type == ::arm_compute::ArgOperation::MIN) + { + value = std::min<T>(value, *reinterpret_cast<T *>(input->ptr_to_element(id))); + } + else if (op_type == ::arm_compute::ArgOperation::MAX) + { + value = std::max<T>(value, *reinterpret_cast<T *>(input->ptr_to_element(id))); + } + else + throw std::runtime_error("This Arg operation is not supported, yet"); + + if (tval != value) + { + min_max_id = id; + tval = value; + } + } + } + } + } + + return min_max_id[axis]; +} + +template <typename T> +inline void +getArgMinMax(const ::arm_compute::ITensor *input, const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::TensorShape &output_shape, ::arm_compute::ITensor *output, + const int axis, const 
::arm_compute::ArgOperation op_type) +{ + ::arm_compute::Coordinates id; + for (size_t out_b = 0; out_b < output_shape[3]; ++out_b) + { + id.set(3, out_b); + for (size_t out_d = 0; out_d < output_shape[2]; ++out_d) + { + id.set(2, out_d); + for (size_t out_h = 0; out_h < output_shape[1]; ++out_h) + { + id.set(1, out_h); + for (size_t out_w = 0; out_w < output_shape[0]; ++out_w) + { + id.set(0, out_w); + *reinterpret_cast<int *>(output->ptr_to_element(id)) = getArgMinMaxEle<T>( + input, input_shape, output_shape, out_b, out_d, out_h, out_w, axis, op_type); + } + } + } + } +} + +void SimpleArgMinMax::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + } + + ::arm_compute::TensorShape input_shape = _input->info()->tensor_shape(); + + // Axis dimension is 1 and size is 1. + // TODO support axis size > 1. + int axis_val = _axis[0]; + ::arm_compute::TensorShape output_shape = inferOutputShape(input_shape, _axis, _input_rank); + + _output->info()->set_tensor_shape(output_shape); + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::QASYMM8: + getArgMinMax<uint8_t>(_input, input_shape, output_shape, _output, axis_val, _op_type); + break; + case ::arm_compute::DataType::S32: + getArgMinMax<int32_t>(_input, input_shape, output_shape, _output, axis_val, _op_type); + break; + case ::arm_compute::DataType::F32: + getArgMinMax<float>(_input, input_shape, output_shape, _output, axis_val, _op_type); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + _output->info()->set_tensor_shape(output_shape); + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.h new 
file mode 100644 index 000000000..b90e74579 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_ARG_MIN_MAX_H__ +#define __SIMPLE_ARG_MIN_MAX_H__ + +#include "internal/arm_compute.h" +#include "arm_compute/core/TypesEx.h" + +class SimpleArgMinMax : public ::arm_compute::IFunction +{ +public: + SimpleArgMinMax(void) : _input(nullptr), _output(nullptr), _axis(), _input_rank(0) + { + // DO NOTHING + } + +public: + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. + * @param[in] axis Dimension along which to find Min or Max Index. 
+ */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + std::vector<uint32_t> axis, ::arm_compute::ArgOperation _op_type); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + std::vector<uint32_t> _axis; + int _input_rank; + ::arm_compute::ArgOperation _op_type; +}; + +#endif /*__SIMPLE_ARG_MIN_MAX_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h index 31c927b4f..aed9ae286 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h @@ -14,15 +14,36 @@ * limitations under the License. */ +/** + * @file SimpleArithmeticAddition.h + * @brief This file contains SimpleArithmeticAddition class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __SIMPLE_ARITHMETIC_ADDITION_H__ #define __SIMPLE_ARITHMETIC_ADDITION_H__ #include "internal/arm_compute.h" #include <arm_compute/core/ITensor.h> +/** + * @brief Class to run SimpleArithmeticAddition Layer + */ class SimpleArithmeticAddition : public ::arm_compute::IFunction { public: + SimpleArithmeticAddition(void) : _lhs(nullptr), _rhs(nullptr), _out(nullptr) + { + // DO NOTHING + } + + /** + * @brief Configure the layer + * @param[in] lhs Lefthand-side operand + * @param[in] rhs Righthand-side operand + * @param[in] out The destination tensor(Result operand) + * @return N/A + */ void configure(::arm_compute::ITensor *lhs, ::arm_compute::ITensor *rhs, ::arm_compute::ITensor *out) { @@ -32,6 +53,10 @@ public: } public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ void run(void) override { if (::internal::arm_compute::isGpuMode()) diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc new file mode 100644 index 000000000..87175ee1a --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleBatchToSpaceNd.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleBatchToSpaceND::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + const int32_t *block_size, + const ::arm_compute::Coordinates &axises) +{ + const auto rank = axises.num_dimensions(); + assert(rank == 4); + + for (int i = 0; i < rank; ++i) + assert(axises[i] >= 0 && axises[i] < rank); + + _input = input; + _output = output; + _block_size = block_size; + _axises = axises; +} + +template <typename T> +inline void BatchToSpaceND(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, + const int32_t *block_size_data, ::arm_compute::ITensor *output, + const ::arm_compute::TensorShape &output_shape, + const ::arm_compute::Coordinates &axises) +{ + const int output_batch = output_shape[axises[0]]; + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int depth = output_shape[axises[3]]; + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + for (int out_w = 0; out_w < output_width; ++out_w) + { + for (int out_d = 0; out_d < depth; ++out_d) + { + const int in_d = out_d; + const int in_h = out_h / block_size_data[0]; + const int in_w = out_w / block_size_data[1]; + const int in_b = + out_b + + ((out_h % block_size_data[0]) * block_size_data[1] + out_w % block_size_data[1]) * + output_batch; + + auto input_id = + asARMComputeCoordinates(::arm_compute::Coordinates{in_b, in_h, in_w, in_d}, axises); + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); + + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); + } + } + } + } +} +void SimpleBatchToSpaceND::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = 
::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + BatchToSpaceND<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + case ::arm_compute::DataType::F32: + BatchToSpaceND<float>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h new file mode 100644 index 000000000..5695d9719 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h @@ -0,0 +1,51 @@ +/* + *Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SIMPLE_BATCH_TO_SPACE_ND_H__ +#define __SIMPLE_BATCH_TO_SPACE_ND_H__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimpleBatchToSpaceND : public ::arm_compute::IFunction +{ +public: + SimpleBatchToSpaceND(void) : _input(nullptr), _output(nullptr), _block_size(nullptr), _axises{} + { + // DO NOTHING + } + + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. + * @param[in] block_size Block size. + * @param[in] axises Axises of rank 4 + */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + const int32_t *block_size, + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + const int32_t *_block_size; + ::arm_compute::Coordinates _axises; +}; + +#endif /*__SIMPLE_BATCH_TO_SPACE_ND_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.cc new file mode 100644 index 000000000..7c7706a78 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.cc @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "internal/layers/SimpleCastLayer.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleCastLayer::castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out, + const arm_compute::Coordinates &id) +{ + switch (in->info()->data_type()) + { + case ::arm_compute::DataType::F32: + { + copyCast(*reinterpret_cast<float *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::S32: + { + copyCast(*reinterpret_cast<int32_t *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::U32: + { + copyCast(*reinterpret_cast<uint32_t *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::QASYMM8: + { + const uint8_t quantizedValue = *(in->ptr_to_element(id)); + copyCast(in->info()->quantization_info().dequantize(quantizedValue), out, id); + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } +} + +void SimpleCastLayer::configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out) +{ + _in = in; + _out = out; +} + +void SimpleCastLayer::run(void) +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_in)->map(q); + CAST_CL(_out)->map(q); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_out->info()->tensor_shape()); + + execute_window_loop(window, + [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); }); + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_out)->unmap(q); + CAST_CL(_in)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h index fa3006438..f9a48b481 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h @@ -14,80 +14,55 @@ * limitations under the License. 
*/ +/** + * @file SimpleCastLayer.h + * @brief This file contains SimpleCastLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __SIMPLE_CAST_LAYER_H__ #define __SIMPLE_CAST_LAYER_H__ -#include <arm_compute/core/ITensor.h> - #include "internal/arm_compute.h" -#include "internal/op/Cast.h" +#include "internal/arm_compute/Cast.h" +/** + * @brief Class to run SimpleCast Layer + */ class SimpleCastLayer : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out) + SimpleCastLayer(void) : _in(nullptr), _out(nullptr) { - _in = in; - _out = out; + // DO NOTHING } -public: - void run(void) override - { - if (::internal::arm_compute::isGpuMode()) - { - auto &q = ::arm_compute::CLScheduler::get().queue(); - CAST_CL(_in)->map(q); - CAST_CL(_out)->map(q); - } - - arm_compute::Window window; - window.use_tensor_dimensions(_out->info()->tensor_shape()); + /** + * @brief Configure the layer + * @param[in] in The source tensor + * @param[in] out The destination tensor + * @return N/A + */ + void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out); - execute_window_loop(window, - [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); }); - - if (::internal::arm_compute::isGpuMode()) - { - auto &q = ::arm_compute::CLScheduler::get().queue(); - CAST_CL(_out)->unmap(q); - CAST_CL(_in)->unmap(q); - } - } + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ + void run(void) override; +private: + /** + * @brief Cast and copy data from one tensor to another + * + * @param[in] in The source tensor + * @param[out] out The destination tensor + * @param[in] id Coordinates to copy + * @return N/A + */ void castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out, - const arm_compute::Coordinates &id) - { - switch (in->info()->data_type()) - { - case ::arm_compute::DataType::F32: - { - copyCast(*reinterpret_cast<float *>(in->ptr_to_element(id)), out, id); - break; - } - case ::arm_compute::DataType::S32: - { - copyCast(*reinterpret_cast<int32_t *>(in->ptr_to_element(id)), out, id); - break; - } - case ::arm_compute::DataType::U32: - { - copyCast(*reinterpret_cast<uint32_t *>(in->ptr_to_element(id)), out, id); - break; - } - case ::arm_compute::DataType::QASYMM8: - { - const uint8_t quantizedValue = *(in->ptr_to_element(id)); - copyCast(in->info()->quantization_info().dequantize(quantizedValue), out, id); - break; - } - default: - throw std::runtime_error("Not supported, yet"); - break; - } - } + const arm_compute::Coordinates &id); -private: ::arm_compute::ITensor *_in; ::arm_compute::ITensor *_out; }; diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.cc new file mode 100644 index 000000000..d62a8321b --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.cc @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleDepthToSpace.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleDepthToSpace::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + int32_t block_size, const ::arm_compute::Coordinates &axises) +{ + const auto rank = axises.num_dimensions(); + assert(rank == 4); + for (int i = 0; i < rank; ++i) + { + assert(axises[i] >= 0); + assert(axises[i] < rank); + } + + _input = input; + _output = output; + _block_size = block_size; + _axises = axises; +} + +template <typename T> +inline void DepthToSpace(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, int32_t block_size, + ::arm_compute::ITensor *output, + const ::arm_compute::TensorShape &output_shape, + const ::arm_compute::Coordinates &axises) +{ + const int output_batch = output_shape[axises[0]]; + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int output_depth = output_shape[axises[3]]; + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + for (int out_w = 0; out_w < output_width; ++out_w) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + const int in_b = out_b; + const int in_h = out_h / block_size; + const int in_w = out_w / block_size; + const int in_d = + out_d + ((out_h % block_size) * block_size + out_w % block_size) * output_depth; + + auto input_id = + asARMComputeCoordinates(::arm_compute::Coordinates{in_b, in_h, in_w, in_d}, axises); + auto 
output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); + + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); + } + } + } + } +} + +void SimpleDepthToSpace::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + DepthToSpace<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + case ::arm_compute::DataType::F32: + DepthToSpace<float>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.h new file mode 100644 index 000000000..1032aaa47 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_DEPTH_TO_SPACE_H__ +#define __SIMPLE_DEPTH_TO_SPACE_H__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimpleDepthToSpace : public ::arm_compute::IFunction +{ +public: + SimpleDepthToSpace(void) : _input(nullptr), _output(nullptr), _block_size(0), _axises{} + { + // DO NOTHING + } + +public: + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. + * @param[in] block_size Block size. + * @param[in] axises Axises of rank 4 + */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size, + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + int32_t _block_size; + ::arm_compute::Coordinates _axises; +}; + +#endif /*__SIMPLE_DEPTH_TO_SPACE_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc index 089c783c1..ae740bb10 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "internal/layers/SimpleEmbeddingLookup.h" #include <arm_compute/runtime/CL/CLScheduler.h> @@ -6,7 +21,8 @@ void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values, ::arm_compute::ITensor *output) { - // Assume that verification of operands are already done at Planner::visit() + assert(values->info()->num_dimensions() == output->info()->num_dimensions()); + assert(values->info()->num_dimensions() > 1 && values->info()->num_dimensions() <= 4); _lookups = lookups; _values = values; _output = output; @@ -25,85 +41,62 @@ void SimpleEmbeddingLookup::run() // type of elements of lookups is always integer const int32_t *lookups_buf = reinterpret_cast<int32_t *>(_lookups->buffer()); - const auto values_buf = _values->buffer(); - auto output_buf = _output->buffer(); const auto lookups_info = _lookups->info(); const auto values_info = _values->info(); const auto output_info = _output->info(); - // TODO Refactor below duplicated code! - const auto values_rank = values_info->num_dimensions(); - switch (values_rank) + // NOTE The first dimension's position is always at the end of dimensions. 
+ const auto first_dim_pos = values_info->num_dimensions() - 1; + + const size_t first_dim = values_info->dimension(first_dim_pos); + for (size_t i = 0; i < lookups_info->dimension(0); ++i) { - case 2: - // (H,W) in nnapi -> (W,H) in acl - { - const size_t row_size = values_info->dimension(1); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 3: - // (B,H,W) in nnapi -> (W,H,B) in acl - { - const size_t row_size = values_info->dimension(2); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 4: - // (N,H,W,C) in nnapi -> (N,C,H,W) in acl - { - const size_t row_size = values_info->dimension(3); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index 
out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, 0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 1: - // In this case, shape of values actually is matrix but the height(row size) is 1 in acl. If - // row size is 1, this op is not needed and it means this situtation could be wrong. - throw std::runtime_error("Wrong usage of EmbeddingLookup op!"); - default: - throw std::runtime_error("Not supported rank!"); + if (lookups_buf[i] < 0 || lookups_buf[i] >= first_dim) + throw std::runtime_error("Embedding Lookup: index out of bounds."); } + // If each strides of values and output are different, applied padding size of the two tensors are + // different, therefore, it can not be copied at once. 
+ auto can_copy_at_once = [&]() -> bool { + const auto &values_strides = values_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + for (size_t i = 0; i < first_dim_pos; ++i) + { + if (values_strides[i] != values_strides[i]) + return false; + } + + return true; + }; + + using ::arm_compute::Window; + using ::arm_compute::Iterator; + + size_t copy_bytes; + Window window; + if (can_copy_at_once()) + { + copy_bytes = values_info->total_size() / first_dim; + window.use_tensor_dimensions(output_info->tensor_shape(), first_dim_pos); + } + else + { + copy_bytes = values_info->dimension(0) * values_info->element_size(); + window.use_tensor_dimensions(output_info->tensor_shape(), Window::DimY); + } + + Iterator it(_output, window); + execute_window_loop(window, + [&](const ::arm_compute::Coordinates &id) { + ::arm_compute::Coordinates values_id = id; + const int idx = id[first_dim_pos]; + values_id.set(first_dim_pos, lookups_buf[idx]); + memcpy(it.ptr(), _values->ptr_to_element(values_id), copy_bytes); + }, + it); + if (::internal::arm_compute::isGpuMode()) { auto &q = ::arm_compute::CLScheduler::get().queue(); diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h index 9f2cd977f..fd499437f 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h @@ -1,16 +1,55 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __SIMPLE_EMBEDDING_LOOKUP_H__ #define __SIMPLE_EMBEDDING_LOOKUP_H__ #include "internal/arm_compute.h" -#include <arm_compute/core/ITensor.h> -#include <arm_compute/runtime/IFunction.h> +/** + * @file SimpleEmbeddingLookup.h + * @brief This file contains SimpleEmbeddingLookup class + * @ingroup COM_AI_RUNTIME + */ + +/** + * @brief Class to run SimpleEmbeddingLookup Layer + */ class SimpleEmbeddingLookup : public ::arm_compute::IFunction { public: + SimpleEmbeddingLookup(void) : _lookups(nullptr), _values(nullptr), _output(nullptr) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] lookups 1D tensor which contains lookup values + * @param[in] values The source tensor + * @param[in] output The destination tensor + * @return N/A + */ void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values, ::arm_compute::ITensor *output); + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ void run() override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.cc new file mode 100644 index 000000000..7f8ae2505 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.cc @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleHashtableLookupLayer.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleHashtableLookupLayer::configure(::arm_compute::ITensor *lookups, + ::arm_compute::ITensor *keys, + ::arm_compute::ITensor *values, + ::arm_compute::ITensor *output, + ::arm_compute::ITensor *hits) +{ + _lookups = lookups; + _keys = keys; + _values = values; + _output = output; + _hits = hits; + _lookup_indices.resize(lookups->info()->dimension(0), -1); +} + +void SimpleHashtableLookupLayer::run() +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_lookups)->map(queue); + CAST_CL(_keys)->map(queue); + CAST_CL(_values)->map(queue); + CAST_CL(_output)->map(queue); + CAST_CL(_hits)->map(queue); + } + + const int32_t *lookups_buf = reinterpret_cast<int32_t *>(_lookups->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(_keys->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + + const auto lookups_info = _lookups->info(); + const auto values_info = _values->info(); + const auto keys_info = _keys->info(); + const auto output_info = _output->info(); + + // NOTE The first dimension's position must be always at the end of dimensions. 
+ const auto first_dim_pos = values_info->num_dimensions() - 1; + const size_t first_dim = values_info->dimension(first_dim_pos); + + std::map<int32_t, size_t> key_map; + const int keys_num = keys_info->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const int lookups_num = lookups_info->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { + if (it->second >= first_dim) + throw std::runtime_error("HashTable Lookup: index out of bounds."); + _lookup_indices[i] = it->second; + } + } + + // If each strides of values and output are different, applied padding size of the two tensors are + // different, therefore, it can not be copied at once. + auto can_copy_at_once = [&]() -> bool { + const auto &values_strides = values_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + for (size_t i = 0; i < first_dim_pos; ++i) + { + if (values_strides[i] != values_strides[i]) + return false; + } + + return true; + }; + + using ::arm_compute::Window; + using ::arm_compute::Iterator; + using ::arm_compute::Coordinates; + + size_t copy_bytes; + Window window; + if (can_copy_at_once()) + { + copy_bytes = values_info->total_size() / first_dim; + window.use_tensor_dimensions(output_info->tensor_shape(), first_dim_pos); + } + else + { + copy_bytes = values_info->dimension(0) * values_info->element_size(); + window.use_tensor_dimensions(output_info->tensor_shape(), Window::DimY); + } + + Iterator it(_output, window); + execute_window_loop(window, + [&](const Coordinates &id) { + Coordinates values_id = id; + const int idx = id[first_dim_pos]; + const int lookup_index = _lookup_indices[idx]; + if (lookup_index >= 0) + { + values_id.set(first_dim_pos, lookup_index); + memcpy(it.ptr(), _values->ptr_to_element(values_id), copy_bytes); + 
hits_buf[lookup_index] = 1; + } + else + { + memset(it.ptr(), 0, copy_bytes); + hits_buf[lookup_index] = 0; + } + }, + it); + + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_lookups)->unmap(queue); + CAST_CL(_keys)->unmap(queue); + CAST_CL(_values)->unmap(queue); + CAST_CL(_output)->unmap(queue); + CAST_CL(_hits)->unmap(queue); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.h new file mode 100644 index 000000000..ba9d2ec0d --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SIMPLE_HASHTABLE_LOOKUP_H__ +#define __SIMPLE_HASHTABLE_LOOKUP_H__ + +#include "internal/arm_compute.h" + +class SimpleHashtableLookupLayer : public ::arm_compute::IFunction +{ +public: + SimpleHashtableLookupLayer(void) + : _lookups(nullptr), _keys(nullptr), _values(nullptr), _output(nullptr), _hits(nullptr) + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *keys, + ::arm_compute::ITensor *values, ::arm_compute::ITensor *output, + ::arm_compute::ITensor *hits); + + void run() override; + +private: + ::arm_compute::ITensor *_lookups; + ::arm_compute::ITensor *_keys; + ::arm_compute::ITensor *_values; + ::arm_compute::ITensor *_output; + ::arm_compute::ITensor *_hits; + std::vector<int32_t> _lookup_indices; +}; + +#endif /*__SIMPLE_HASHTABLE_LOOKUP_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.cc new file mode 100644 index 000000000..d3943ad40 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleNeg.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleNeg::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output) +{ + _input = input; + _output = output; +} + +void SimpleNeg::run() +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->map(queue); + CAST_CL(_output)->map(queue); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_output->info()->tensor_shape()); + + execute_window_loop(window, [this](const arm_compute::Coordinates &id) { + // NOTE Must be two input tensors of identical type + // Must be output tensor of the same type as input0. + assert(_input->info()->data_type() == _output->info()->data_type()); + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::F32: + { + const auto input_value = *reinterpret_cast<float *>(_input->ptr_to_element(id)); + *reinterpret_cast<float *>(_output->ptr_to_element(id)) = -input_value; + break; + } + case ::arm_compute::DataType::S32: + { + const auto input_value = *reinterpret_cast<int32_t *>(_input->ptr_to_element(id)); + *reinterpret_cast<int32_t *>(_output->ptr_to_element(id)) = -input_value; + break; + } + case ::arm_compute::DataType::U32: + { + const auto input_value = *reinterpret_cast<uint32_t *>(_input->ptr_to_element(id)); + *reinterpret_cast<uint32_t *>(_output->ptr_to_element(id)) = -input_value; + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } + }); + + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->unmap(queue); + CAST_CL(_output)->unmap(queue); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.h index cb3f36337..4ca88e7f8 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.h @@ -1,41 +1,39 @@ -/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PAD_LAYER_H__
-#define __PAD_LAYER_H__
-
-#include <arm_compute/runtime/CL/CLTensor.h>
-#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
-
-class PadLayer : public ::arm_compute::IFunction
-{
-public:
- void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
- unsigned int border_width);
- void run(void) override;
-
-private:
- ::arm_compute::ICLTensor *_input;
- ::arm_compute::ICLTensor *_output;
- int _border_width;
- int _output_height;
- int _output_width;
-
- ::arm_compute::CLFillBorder _fillborderkernel;
- void populateOutput();
-};
-
-#endif // __PAD_LAYER_H__
+/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_NEG_H__ +#define __SIMPLE_NEG_H__ + +#include "internal/arm_compute.h" + +class SimpleNeg : public ::arm_compute::IFunction +{ +public: + SimpleNeg(void) : _input(nullptr), _output(nullptr) + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; +}; + +#endif /*__SIMPLE_NEG_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.cc new file mode 100644 index 000000000..2a0a25f0c --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.cc @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "internal/arm_compute.h" +#include "SimplePackLayer.h" + +void SimplePackLayer::configure(const std::vector<::arm_compute::ICLTensor *> &input_vector, + ::arm_compute::ICLTensor *output, int32_t axis) +{ + uint32_t nr_inputs = input_vector.size(); + uint32_t output_rank = output->info()->num_dimensions(); + const ::arm_compute::PermutationVector pv{1, 2, 0}; + _cl_permuted_vector.resize(nr_inputs); + _cl_permute_vector.resize(nr_inputs); + + _output = output; + // A negative axis implies axis from the end. + // For example, axis = -1 implies the first axis from the end, i.e. axis = Rank - 1. + // Similarly, axis = -2 imples second axis from the end, i.e. axis = Rank - 2. + if (axis < 0) + { + axis += output_rank; + } + _axis = ToARMComputeAxis(output_rank, axis).value(); + _cl_reshape_vector.resize(nr_inputs); + + ::arm_compute::TensorShape subTensor_shape{}; + for (int i = 0; i < output_rank; i++) + { + if (i != _axis) + { + subTensor_shape.set(i, _output->info()->tensor_shape()[i]); + } + else + { + subTensor_shape.set(i, 1); + } + } + + auto subTensor_offset = ::arm_compute::Coordinates{}; + subTensor_offset.set_num_dimensions(output_rank); + + for (int i = 0; i < input_vector.size(); i++) + { + _input_vector.push_back(input_vector[i]); + subTensor_offset[_axis] = i; + auto temp_tensor = std::make_shared<::arm_compute::CLSubTensor>( + CAST_CL(_output), subTensor_shape, subTensor_offset, true); + _sub_tensor_vector.push_back(temp_tensor); + // configure to resize of input tensor in sub tensor offseted, dimension expansion will be + // automatic + _cl_permute_vector[i].configure(CAST_CL(_input_vector[i]), &_cl_permuted_vector[i], pv); + _cl_reshape_vector[i].configure(&_cl_permuted_vector[i], _sub_tensor_vector[i].get()); + _cl_permuted_vector[i].allocator()->allocate(); + } +} + +void SimplePackLayer::run(void) +{ + for (int i = 0; i < 
_input_vector.size(); i++) + { + _cl_permute_vector[i].run(); + _cl_reshape_vector[i].run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.h new file mode 100644 index 000000000..2c2fc37f2 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __SIMPLE_PACK_LAYER_H__ +#define __SIMPLE_PACK_LAYER_H__ + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/CLSubTensor.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/CL/functions/CLPermute.h> + +class SimplePackLayer : public ::arm_compute::IFunction +{ +public: + SimplePackLayer(void) + : _cl_permuted_vector{}, _input_vector{}, _sub_tensor_vector{}, _cl_reshape_vector{}, + _cl_permute_vector{}, _output(nullptr), _axis(0) + { + // DO NOTHING + } + +public: + void configure(const std::vector<::arm_compute::ICLTensor *> &input_vector, + ::arm_compute::ICLTensor *output, int axis); + +public: + void run(void) override; + +private: + std::vector<::arm_compute::CLTensor> _cl_permuted_vector; + std::vector<::arm_compute::ICLTensor *> _input_vector; + std::vector<std::shared_ptr<::arm_compute::CLSubTensor>> _sub_tensor_vector; + std::vector<::arm_compute::CLReshapeLayer> _cl_reshape_vector; + std::vector<::arm_compute::CLPermute> _cl_permute_vector; + ::arm_compute::ICLTensor *_output; + int _axis; +}; + +#endif // __SIMPLE_PACK_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc new file mode 100644 index 000000000..64236603f --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimplePadLayer.h" +#include <arm_compute/runtime/CL/CLScheduler.h> + +namespace +{ +bool validate_arg(const ::arm_compute::ITensor *input, const ::arm_compute::ITensor *output, + const ::arm_compute::ITensor *padding_size, + const ::arm_compute::Coordinates &axises) +{ + const int input_batch = input->info()->tensor_shape()[axises[0]]; + const int input_height = input->info()->tensor_shape()[axises[1]]; + const int input_width = input->info()->tensor_shape()[axises[2]]; + const int input_depth = input->info()->tensor_shape()[axises[3]]; + + const int output_batch = output->info()->tensor_shape()[axises[0]]; + const int output_height = output->info()->tensor_shape()[axises[1]]; + const int output_width = output->info()->tensor_shape()[axises[2]]; + const int output_depth = output->info()->tensor_shape()[axises[3]]; + + auto pad_batch_up = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 0})); + auto pad_batch_down = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 0})); + auto pad_height_top = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 1})); + auto pad_height_bottom = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 1})); + auto pad_width_left = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 2})); + auto pad_width_right = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 2})); + auto pad_depth_front = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 3})); + auto pad_depth_back = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 3})); + + const int padded_batch = input_batch + pad_batch_up + pad_batch_down; + const int padded_height = input_height + pad_height_top + pad_height_bottom; + const int padded_width = input_width + pad_width_left + pad_width_right; + const int 
padded_depth = input_depth + pad_depth_front + pad_depth_back; + + return (padded_batch == output_batch) && (padded_height == output_height) && + (padded_width == output_width) && (padded_depth == output_depth); +} +} // namespace + +void SimplePadLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + ::arm_compute::ITensor *padding_size, + const ::arm_compute::Coordinates &axises) +{ + + const auto rank = axises.num_dimensions(); + assert(rank == 4); + assert(input != nullptr && output != nullptr && padding_size != nullptr); + + for (int i = 0; i < rank; ++i) + { + assert(axises[i] >= 0); + assert(axises[i] < rank); + } + + _input = input; + _output = output; + _padding_size = padding_size; + _axises = axises; +} + +template <typename T> +inline void ApplyPadding(const ::arm_compute::ITensor *input_data, + const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::ITensor *padding_size, + ::arm_compute::ITensor *output_data, + const ::arm_compute::TensorShape &output_shape, + const ::arm_compute::Coordinates &axises, T zero_value) +{ + + assert(validate_arg(input_data, output_data, padding_size, axises) && + "Padded Input shape does not match to output shape"); + + const int input_batch = input_shape[axises[0]]; + const int input_height = input_shape[axises[1]]; + const int input_width = input_shape[axises[2]]; + const int input_depth = input_shape[axises[3]]; + + const int output_batch = output_shape[axises[0]]; + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int output_depth = output_shape[axises[3]]; + + // Padding size for Up, Top, Left and Front are required. 
+ auto pad_batch_up = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 0})); + auto pad_height_top = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 1})); + auto pad_width_left = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 2})); + auto pad_depth_front = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 3})); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + for (int out_w = 0; out_w < output_width; ++out_w) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); + + if (out_b < pad_batch_up || out_b >= (input_batch + pad_batch_up) || + out_h < pad_height_top || out_h >= (input_height + pad_height_top) || + out_w < pad_width_left || out_w >= (input_width + pad_width_left) || + out_d < pad_depth_front || out_d >= (input_depth + pad_depth_front)) + { + *reinterpret_cast<T *>(output_data->ptr_to_element(output_id)) = zero_value; + } + else + { + auto input_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b - pad_batch_up, out_h - pad_height_top, + out_w - pad_width_left, out_d - pad_depth_front}, + axises); + *reinterpret_cast<T *>(output_data->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input_data->ptr_to_element(input_id)); + } + } + } + } + } +} +void SimplePadLayer::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + CAST_CL(_padding_size)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + ApplyPadding<uint8_t>(_input, _input->info()->tensor_shape(), _padding_size, _output, + _output->info()->tensor_shape(), _axises, + _input->info()->quantization_info().offset); + break; + case 
::arm_compute::DataType::F32: + ApplyPadding<float>(_input, _input->info()->tensor_shape(), _padding_size, _output, + _output->info()->tensor_shape(), _axises, 0.0f); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + CAST_CL(_padding_size)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h new file mode 100644 index 000000000..8cb6659ce --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SIMPLE_PAD_LAYER_H__ +#define __SIMPLE_PAD_LAYER_H__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimplePadLayer : public ::arm_compute::IFunction +{ +public: + SimplePadLayer(void) : _input(nullptr), _output(nullptr), _padding_size(nullptr), _axises{} + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + ::arm_compute::ITensor *padding_size, + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + + void run(void) override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + ::arm_compute::ITensor *_padding_size; + ::arm_compute::Coordinates _axises; +}; + +#endif // __SIMPLE_PAD_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.cc new file mode 100644 index 000000000..b5b3a0950 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleSQRT.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleSQRT::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output) +{ + _input = input; + _output = output; +} + +void SimpleSQRT::run() +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->map(queue); + CAST_CL(_output)->map(queue); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_output->info()->tensor_shape()); + + execute_window_loop(window, [this](const arm_compute::Coordinates &id) { + // NOTE Must be two input tensors of identical type + // Must be output tensor of the same type as input0. + assert(_input->info()->data_type() == _output->info()->data_type()); + + const auto input_value = *reinterpret_cast<float *>(_input->ptr_to_element(id)); + *reinterpret_cast<float *>(_output->ptr_to_element(id)) = sqrt(input_value); + }); + + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->unmap(queue); + CAST_CL(_output)->unmap(queue); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.h new file mode 100644 index 000000000..b05a9e32e --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_SQRT_H__ +#define __SIMPLE_SQRT_H__ + +#include "internal/arm_compute.h" + +class SimpleSQRT : public ::arm_compute::IFunction +{ +public: + SimpleSQRT(void) : _input(nullptr), _output(nullptr) + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; +}; + +#endif /*__SIMPLE_SQRT_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.cc new file mode 100644 index 000000000..f53675b99 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleSpaceToBatchND.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleSpaceToBatchND::configure(::arm_compute::ITensor *input, + ::arm_compute::ITensor *block_size, + ::arm_compute::ITensor *padding_size, + ::arm_compute::ITensor *output) +{ + const auto rank = input->info()->num_dimensions(); + assert(rank == 4); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; +} + +template <typename T> +inline void +SpaceToBatchND(const ::arm_compute::ITensor *input, const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::ITensor *block_size, const ::arm_compute::ITensor *padding_size, + const ::arm_compute::ITensor *output, const ::arm_compute::TensorShape &output_shape, + T zero_value) +{ + const int input_batch = input_shape[3]; + const int input_height = input_shape[1]; + const int input_width = input_shape[0]; + + const int depth = output_shape[2]; + + const int padding_height_left = *reinterpret_cast<int *>(padding_size->ptr_to_element({0, 1})); + const int padding_height_right = *reinterpret_cast<int *>(padding_size->ptr_to_element({1, 1})); + const int padding_width_left = *reinterpret_cast<int *>(padding_size->ptr_to_element({0, 0})); + const int padding_width_right = *reinterpret_cast<int *>(padding_size->ptr_to_element({1, 0})); + const int padded_height = input_height + padding_height_left + padding_height_right; + const int padded_width = input_width + padding_width_left + padding_width_right; + + const int block_size_height = *reinterpret_cast<int *>(block_size->ptr_to_element({1})); + const int block_size_width = *reinterpret_cast<int *>(block_size->ptr_to_element({0})); + + assert(padding_height_left >= 0); + assert(padding_height_right >= 0); + assert(padding_width_left >= 0); + assert(padding_width_right >= 0); + assert(block_size_height >= 1); + assert(block_size_width >= 1); + assert(padded_height % block_size_height == 0); + 
assert(padded_width % block_size_width == 0); + assert(output->info()->dimension(3) == + input->info()->dimension(3) * (block_size_height * block_size_width)); + + for (int in_b = 0; in_b < input_batch; ++in_b) + { + for (int in_d = 0; in_d < depth; ++in_d) + { + for (int in_h = 0; in_h < padded_height; ++in_h) + { + for (int in_w = 0; in_w < padded_width; ++in_w) + { + const int out_d = in_d; + const int out_h = in_h / block_size_height; + const int out_w = in_w / block_size_width; + const int out_b = + in_b + + ((in_h % block_size_height) * block_size_width + in_w % block_size_width) * + input_batch; + + const ::arm_compute::Coordinates output_id{out_w, out_h, out_d, out_b}; + + if (in_h < padding_height_left || in_h >= (input_height + padding_height_left) || + in_w < padding_width_left || in_w >= (input_width + padding_width_left)) + { + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = zero_value; + } + else + { + const ::arm_compute::Coordinates input_id{in_w - padding_width_left, + in_h - padding_height_left, in_d, in_b}; + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); + } + } + } + } + } +} +void SimpleSpaceToBatchND::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_block_size)->map(q); + CAST_CL(_padding_size)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + SpaceToBatchND<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _padding_size, + _output, _output->info()->tensor_shape(), + _input->info()->quantization_info().offset); + break; + case ::arm_compute::DataType::F32: + SpaceToBatchND<float>(_input, _input->info()->tensor_shape(), _block_size, _padding_size, + _output, _output->info()->tensor_shape(), 0.0f); + break; + default: + 
ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_block_size)->unmap(q); + CAST_CL(_padding_size)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.h new file mode 100644 index 000000000..4af961d34 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_SPACE_TO_BATCHND_H__ +#define __SIMPLE_SPACE_TO_BATCHND_H__ + +#include "internal/arm_compute.h" + +class SimpleSpaceToBatchND : public ::arm_compute::IFunction +{ +public: + SimpleSpaceToBatchND(void) + : _input(nullptr), _block_size(nullptr), _padding_size(nullptr), _output(nullptr) + { + // DO NOTHING + } + + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[in] block_size Block size. + * @param[in] padding_size Padding size. + * @param[out] output Output tensor. 
+ */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *block_size, + ::arm_compute::ITensor *padding_size, ::arm_compute::ITensor *output); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_block_size; + ::arm_compute::ITensor *_padding_size; + ::arm_compute::ITensor *_output; +}; + +#endif /*__SIMPLE_SPACE_TO_BATCHND_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc index 682295f81..3519da1f3 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc @@ -19,11 +19,8 @@ #include <arm_compute/runtime/CL/CLScheduler.h> void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, - int32_t block_size, - const ::arm_compute::Coordinates &axises = {3, 1, 0, 2}) + int32_t block_size, const ::arm_compute::Coordinates &axises) { - assert(input->info()->num_dimensions() == 4); - assert(output->info()->num_dimensions() == 4); const auto rank = axises.num_dimensions(); assert(rank == 4); for (int i = 0; i < rank; ++i) @@ -38,26 +35,10 @@ void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute: _axises = axises; } -inline int32_t Offset4D(const ::arm_compute::TensorShape &shape, int32_t b, int32_t h, int32_t w, - int32_t d, const ::arm_compute::Coordinates &axises) -{ - // b, h, w, d >= 0 - size_t indexes[4]; - indexes[axises[0]] = b; - indexes[axises[1]] = h; - indexes[axises[2]] = w; - indexes[axises[3]] = d; - - int32_t offset = indexes[3] * shape[2] * shape[1] * shape[0]; - offset += indexes[2] * shape[1] * shape[0]; - offset += indexes[1] * shape[0]; - offset += indexes[0]; - return offset; -} - template <typename T> -inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape &input_shape, - int32_t block_size, T 
*output_data, +inline void SpaceToDepth(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, int32_t block_size, + ::arm_compute::ITensor *output, const ::arm_compute::TensorShape &output_shape, const ::arm_compute::Coordinates &axises) { @@ -66,16 +47,6 @@ inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape & const int input_width = input_shape[axises[2]]; const int input_depth = input_shape[axises[3]]; - const int output_batch = output_shape[axises[0]]; - const int output_height = output_shape[axises[1]]; - const int output_width = output_shape[axises[2]]; - const int output_depth = output_shape[axises[3]]; - - assert(input_batch == output_batch); - assert(input_height == output_height * block_size); - assert(input_width == output_width * block_size); - assert(input_depth * block_size * block_size == output_depth); - for (int in_b = 0; in_b < input_batch; ++in_b) { for (int in_h = 0; in_h < input_height; ++in_h) @@ -90,10 +61,13 @@ inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape & const int out_d = in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth; - const int input_index = Offset4D(input_shape, in_b, in_h, in_w, in_d, axises); - const int output_index = Offset4D(output_shape, out_b, out_h, out_w, out_d, axises); + auto input_id = + asARMComputeCoordinates(::arm_compute::Coordinates{in_b, in_h, in_w, in_d}, axises); + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); - output_data[output_index] = input_data[input_index]; + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); } } } @@ -110,35 +84,16 @@ void SimpleSpaceToDepth::run() CAST_CL(_output)->map(q); } - auto input_buf = _input->buffer(); - auto output_buf = _output->buffer(); switch (_input->info()->data_type()) { case ::arm_compute::DataType::U8: case 
::arm_compute::DataType::QASYMM8: - SpaceToDepth(reinterpret_cast<const uint8_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<uint8_t *>(output_buf), - _output->info()->tensor_shape(), _axises); - break; - case ::arm_compute::DataType::S8: - SpaceToDepth(reinterpret_cast<const int8_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<int8_t *>(output_buf), - _output->info()->tensor_shape(), _axises); - break; - case ::arm_compute::DataType::U32: - SpaceToDepth(reinterpret_cast<const uint32_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<uint32_t *>(output_buf), - _output->info()->tensor_shape(), _axises); - break; - case ::arm_compute::DataType::S32: - SpaceToDepth(reinterpret_cast<const int32_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<int32_t *>(output_buf), - _output->info()->tensor_shape(), _axises); + SpaceToDepth<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); break; case ::arm_compute::DataType::F32: - SpaceToDepth(reinterpret_cast<const float *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<float *>(output_buf), - _output->info()->tensor_shape(), _axises); + SpaceToDepth<float>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); break; default: ARM_COMPUTE_ERROR("DataType not supported"); diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h index f5e028b1c..9e87c364c 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h @@ -14,25 +14,44 @@ * limitations under the License. 
 */ +/** + * @file SimpleSpaceToDepth.h + * @brief This file contains SimpleSpaceToDepth class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __SIMPLE_SPACE_TO_DEPTH_H__ #define __SIMPLE_SPACE_TO_DEPTH_H__ #include "internal/arm_compute.h" -#include <arm_compute/core/ITensor.h> -#include <arm_compute/runtime/IFunction.h> +#include "internal/arm_compute/Cast.h" +/** + * @brief Class to run SimpleSpaceToDepth Layer + */ class SimpleSpaceToDepth : public ::arm_compute::IFunction { public: - /** Initialise input and output - * - * @param[in] input First tensor input. - * @param[out] output Output tensor. - * @param[in] block_size Block size. + SimpleSpaceToDepth(void) : _input(nullptr), _output(nullptr), _block_size(0), _axises{} + { + // DO NOTHING + } + + /** + * @brief Configure the layer + * @param[in] input First tensor input. + * @param[in] output Output tensor. + * @param[in] block_size Block size. + * @param[in] axises Axises of rank 4 + * @return N/A */ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size, - const ::arm_compute::Coordinates &axises); + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ void run() override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.cc new file mode 100644 index 000000000..abc291289 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.cc @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleTransposeConv.h" +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleTransposeConv::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, + ::arm_compute::ITensor *output, + ::arm_compute::PadStrideInfo &tconv_info, + ::arm_compute::Coordinates axises) +{ + auto rank = axises.num_dimensions(); + + assert(rank == 4); + + _input = input; + _weights = weights; + _output = output; + _stride_width = tconv_info.stride().first; + _stride_height = tconv_info.stride().second; + _pad_width = tconv_info.pad_left(); + _pad_height = tconv_info.pad_top(); + _axises = axises; +} + +template <typename T> +inline void ApplyTransposeConv( + const ::arm_compute::TensorShape &input_shape, const ::arm_compute::ITensor *input_data, + const ::arm_compute::TensorShape &filter_shape, const ::arm_compute::ITensor *filter_data, + const ::arm_compute::TensorShape &output_shape, const ::arm_compute::ITensor *output_data, + const int32_t stride_width, const int32_t stride_height, const int32_t pad_width, + const int32_t pad_height, const ::arm_compute::Coordinates axises) +{ + const int batches = input_shape[axises[0]]; + const int input_height = input_shape[axises[1]]; + const int input_width = input_shape[axises[2]]; + const int input_depth = input_shape[axises[3]]; + + const int filter_height = filter_shape[axises[1]]; + const int filter_width = filter_shape[axises[2]]; + + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int output_depth = 
output_shape[axises[3]]; + + assert(batches == output_shape[axises[0]]); + assert(input_depth == filter_shape[axises[3]]); + assert(filter_shape[axises[0]] == output_depth); + + // Although transpose convolution simplifies to convolution with transposed + // weights for strides of 1, non-unitary striding complicates matters. To + // keep this reference implementation as clear as possible, we use a + // "scatter" access pattern, where we loop through all the input elements, + // computing their influence on the output, rather than looping through the + // output elements in the typical "gather" access pattern of a conv. We + // therefore must initialize the output array to zero. + + // Loop through input elements one at a time. + for (int batch = 0; batch < batches; ++batch) + { + for (int in_y = 0; in_y < input_height; ++in_y) + { + for (int in_x = 0; in_x < input_width; ++in_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + // Loop through the output elements it will influence + const int out_x_origin = (in_x * stride_width) - pad_width; + const int out_y_origin = (in_y * stride_height) - pad_height; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + // Compute output element location + const int out_x = out_x_origin + filter_x; + const int out_y = out_y_origin + filter_y; + // We cannot accumulate out of bounds + if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && + (out_y < output_height)) + { + auto input_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{batch, in_y, in_x, in_channel}, axises); + auto filter_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{in_channel, filter_y, filter_x, out_channel}, + axises); + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{batch, out_y, out_x, out_channel}, axises); + T input_value = 
*reinterpret_cast<T *>(input_data->ptr_to_element(input_id)); + T filter_value = *reinterpret_cast<T *>(filter_data->ptr_to_element(filter_id)); + *reinterpret_cast<T *>(output_data->ptr_to_element(output_id)) += + input_value * filter_value; + } + } + } + } + } + } + } + } +} + +void SimpleTransposeConv::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_weights)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::S32: + ApplyTransposeConv<int32_t>(_input->info()->tensor_shape(), _input, + _weights->info()->tensor_shape(), _weights, + _output->info()->tensor_shape(), _output, _stride_width, + _stride_height, _pad_width, _pad_height, _axises); + break; + case ::arm_compute::DataType::F32: + ApplyTransposeConv<float>(_input->info()->tensor_shape(), _input, + _weights->info()->tensor_shape(), _weights, + _output->info()->tensor_shape(), _output, _stride_width, + _stride_height, _pad_width, _pad_height, _axises); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_weights)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.h new file mode 100644 index 000000000..c5519828b --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __TRANSPOSE_CONV_EX__ +#define __TRANSPOSE_CONV_EX__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimpleTransposeConv : public ::arm_compute::IFunction +{ +public: + SimpleTransposeConv() + : _input(nullptr), _weights(nullptr), _output(nullptr), _stride_width(0), _stride_height(0), + _pad_width(0), _pad_height(0) + { + // DO NOTHING + } + + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[in] weights Weights + * @param[out] output Output tensor. + * @param[in] tconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. 
+ * @param[in] axises Axises of rank 4 + */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, + ::arm_compute::ITensor *output, ::arm_compute::PadStrideInfo &tconv_info, + ::arm_compute::Coordinates axises = getARMComputeAxises(4)); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_weights; + ::arm_compute::ITensor *_output; + int32_t _stride_width; + int32_t _stride_height; + int32_t _pad_width; + int32_t _pad_height; + ::arm_compute::Coordinates _axises; +}; + +#endif /*__TRANSPOSE_CONV_EX__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc new file mode 100644 index 000000000..910595a44 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + */ +#include "internal/arm_compute.h" +#include "SimpleUnpackLayer.h" + +void SimpleUnpackLayer::configure(::arm_compute::ICLTensor *input, + const std::vector<::arm_compute::ICLTensor *> &output_vector, + int32_t axis) +{ + uint32_t nr_outputs = output_vector.size(); + _cl_permuted_vector.resize(nr_outputs); + _cl_permute_vector.resize(nr_outputs); + uint32_t input_rank = input->info()->num_dimensions(); + const ::arm_compute::PermutationVector pv{2, 0, 1}; + _input = input; + // Negative axis is supported, -1 implies R-1 axis where R is input rank + if (axis < 0) + { + axis += input_rank; + } + _axis = ToARMComputeAxis(input_rank, axis).value(); + _cl_reshape_vector.resize(nr_outputs); + + ::arm_compute::TensorShape subTensor_shape{}; + for (int i = 0; i < input_rank; i++) + { + if (i != _axis) + { + subTensor_shape.set(i, _input->info()->tensor_shape()[i]); + } + else + { + subTensor_shape.set(i, 1); + } + } + + auto subTensor_offset = ::arm_compute::Coordinates{}; + subTensor_offset.set_num_dimensions(input_rank); + + for (int i = 0; i < output_vector.size(); i++) + { + _output_vector.push_back(output_vector[i]); + subTensor_offset[_axis] = i; + auto temp_tensor = std::make_shared<::arm_compute::CLSubTensor>( + CAST_CL(_input), subTensor_shape, subTensor_offset, true); + _sub_tensor_vector.push_back(temp_tensor); + // Copies into the subtensor + _cl_permute_vector[i].configure(_sub_tensor_vector[i].get(), &_cl_permuted_vector[i], pv); + _cl_reshape_vector[i].configure(&_cl_permuted_vector[i], CAST_CL(_output_vector[i])); + _cl_permuted_vector[i].allocator()->allocate(); + } +} + +void SimpleUnpackLayer::run(void) +{ + for (int i = 0; i < _output_vector.size(); i++) + { + _cl_permute_vector[i].run(); + _cl_reshape_vector[i].run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h new file mode 100644 index 000000000..52fc7513d --- /dev/null +++ 
b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __UNPACK_LAYER_H__ +#define __UNPACK_LAYER_H__ + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/CLSubTensor.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/CL/functions/CLPermute.h> + +class SimpleUnpackLayer : public ::arm_compute::IFunction +{ +public: + SimpleUnpackLayer(void) + : _cl_permuted_vector{}, _output_vector{}, _sub_tensor_vector{}, _cl_reshape_vector{}, + _cl_permute_vector{}, _input(nullptr), _axis(0) + { + // DO NOTHING + } + +public: + void configure(::arm_compute::ICLTensor *input, + const std::vector<::arm_compute::ICLTensor *> &output_vector, int32_t axis); + +public: + void run(void) override; + +private: + std::vector<::arm_compute::CLTensor> _cl_permuted_vector; + std::vector<::arm_compute::ICLTensor *> _output_vector; + std::vector<std::shared_ptr<::arm_compute::CLSubTensor>> _sub_tensor_vector; + std::vector<::arm_compute::CLReshapeLayer> _cl_reshape_vector; + std::vector<::arm_compute::CLPermute> _cl_permute_vector; + ::arm_compute::ICLTensor *_input; + int32_t _axis; +}; + +#endif // __UNPACK_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc 
b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc deleted file mode 100644 index 3f988a819..000000000 --- a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "SquaredDifferenceOperation.h" -#include "internal/arm_compute.h" - -void SquaredDifferenceOperation::configure(::arm_compute::ITensor *input1, - ::arm_compute::ITensor *input2, - ::arm_compute::ITensor *output, - ::arm_compute::ConvertPolicy ConvertPolicy, float scale, - ::arm_compute::RoundingPolicy RoundingPolicy) -{ - _input1 = input1; - _input2 = input2; - _output = output; - - if (::internal::arm_compute::isGpuMode()) - { - _cl_sub.configure(CAST_CL(input1), CAST_CL(input2), CAST_CL(output), ConvertPolicy); - _cl_mul.configure(CAST_CL(output), CAST_CL(output), CAST_CL(output), scale, ConvertPolicy, - RoundingPolicy); - } - else - { - _neon_sub.configure(CAST_NE(input1), CAST_NE(input2), CAST_NE(output), ConvertPolicy); - _neon_mul.configure(CAST_NE(output), CAST_NE(output), CAST_NE(output), scale, ConvertPolicy, - RoundingPolicy); - } -} - -void SquaredDifferenceOperation::run(void) -{ - if (::internal::arm_compute::isGpuMode()) - { - _cl_sub.run(); - _cl_mul.run(); - } - else - { - _neon_sub.run(); - _neon_mul.run(); - } -} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h deleted file mode 100644 index 3782c4e8c..000000000 --- a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __SQUARED_DIFFERENCE_OPERATION_H__ -#define __SQUARED_DIFFERENCE_OPERATION_H__ - -#include <arm_compute/runtime/Tensor.h> -#include <arm_compute/runtime/CL/CLTensor.h> - -#include <arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h> -#include <arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h> -#include 
<arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h> -#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h> - -class SquaredDifferenceOperation : public ::arm_compute::IFunction -{ -public: - void configure(::arm_compute::ITensor *input1, ::arm_compute::ITensor *input2, - ::arm_compute::ITensor *output, ::arm_compute::ConvertPolicy ConvertPolicy, - float scale, ::arm_compute::RoundingPolicy RoundingPolicy); - -public: - void run(void) override; - -private: - ::arm_compute::ITensor *_input1; - ::arm_compute::ITensor *_input2; - - ::arm_compute::ITensor *_output; - -private: - ::arm_compute::CLArithmeticSubtraction _cl_sub; - ::arm_compute::CLPixelWiseMultiplication _cl_mul; - - ::arm_compute::NEArithmeticSubtraction _neon_sub; - ::arm_compute::NEPixelWiseMultiplication _neon_mul; -}; -#endif // __SQUARED_DIFFERENCE_OPERATION_H__ |