Diffstat (limited to 'runtimes/pure_arm_compute/src/internal/layers')
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h          88
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc  90
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h   53
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc         66
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h          50
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc                    78
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/PadLayer.h                     41
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h     108
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h              95
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc       115
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h        22
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc          155
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h           45
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc  40
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h   35
15 files changed, 1081 insertions, 0 deletions
diff --git a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h
new file mode 100644
index 000000000..502a1ee0e
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FEATURE_LOGGING_LAYER_H__
+#define __FEATURE_LOGGING_LAYER_H__
+
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <iostream>
+#include <iomanip>
+#include <limits>
+
+#include "internal/arm_compute.h"
+
+class FeatureLoggingLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(const std::string &tag, ::arm_compute::ITensor *target)
+ {
+ _tag = tag;
+ _target = target;
+ }
+
+public:
+ void run(void) override
+ {
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_target)->map(q);
+ }
+
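+ // Dump the target feature map channel by channel as H x W grids (values are read as float; see the TODO below)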
+ const size_t W = _target->info()->dimension(0);
+ const size_t H = _target->info()->dimension(1);
+ const size_t C = _target->info()->dimension(2);
+
+ std::cout << _tag << std::endl;
+
+ for (size_t ch = 0; ch < C; ++ch)
+ {
+ std::cout << "Channel #" << ch << std::endl;
+ for (size_t row = 0; row < H; ++row)
+ {
+ for (size_t col = 0; col < W; ++col)
+ {
+ const arm_compute::Coordinates id{col, row, ch};
+ const auto value = *reinterpret_cast<float *>(_target->ptr_to_element(id));
+
+ // TODO Generalize this to integer types
+ std::cout << std::setprecision(2);
+ std::cout << std::setw(7);
+ std::cout << std::setfill(' ');
+ std::cout << std::fixed;
+ std::cout << value << " ";
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ }
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_target)->unmap(q);
+ }
+ }
+
+private:
+ std::string _tag;
+ ::arm_compute::ITensor *_target;
+};
+
+#endif // __FEATURE_LOGGING_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
new file mode 100644
index 000000000..311284efc
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenericFullyConnectedLayer.h"
+#include "internal/arm_compute.h"
+
+#include <arm_compute/core/Helpers.h>
+
+void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input,
+ ::arm_compute::ITensor *weights,
+ ::arm_compute::ITensor *biases,
+ ::arm_compute::ITensor *output, bool needs_reshape,
+ ::arm_compute::TensorShape reshape)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
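+ // When 'needs_reshape' is set, the input is first rearranged to 'reshape' by GenericReshapeLayer into an
+ // internal buffer, and that buffer feeds the fully connected layer.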
+ // TODO Too much duplicated code. Revise the code below.
+ if (::internal::arm_compute::isGpuMode())
+ {
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+ _generic_reshape.configure(CAST_CL(_input), &_cl_buffer);
+
+ _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+
+ // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_buffer.allocator()->allocate();
+ }
+ else
+ {
+ _cl_fc.configure(CAST_CL(_input), CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+ }
+ }
+ else
+ {
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+ _generic_reshape.configure(CAST_NE(_input), &_neon_buffer);
+
+ _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+
+ // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate
+ // here.
+ _neon_buffer.allocator()->allocate();
+ }
+ else
+ {
+ _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+ }
+ }
+}
+
+void GenericFullyConnectedLayer::run(void)
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ if (_needs_reshape)
+ _generic_reshape.run();
+
+ _cl_fc.run();
+ }
+ else
+ {
+ if (_needs_reshape)
+ _generic_reshape.run();
+
+ _neon_fc.run();
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
new file mode 100644
index 000000000..55d8683da
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__
+#define __GENERIC_FULLY_CONNECTED_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include "internal/layers/GenericReshapeLayer.h"
+
+class GenericFullyConnectedLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
+ ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape,
+ ::arm_compute::TensorShape reshape);
+
+public:
+ void run(void) override;
+
+private:
+ ::arm_compute::ITensor *_input;
+ ::arm_compute::ITensor *_weights;
+ ::arm_compute::ITensor *_biases;
+ ::arm_compute::ITensor *_output;
+
+ // buffer for reshaping input tensor
+ ::arm_compute::CLTensor _cl_buffer;
+ ::arm_compute::Tensor _neon_buffer;
+
+private:
+ ::arm_compute::CLFullyConnectedLayer _cl_fc;
+ ::arm_compute::NEFullyConnectedLayer _neon_fc;
+ GenericReshapeLayer _generic_reshape;
+ bool _needs_reshape;
+};
+
+#endif // __GENERIC_FULLY_CONNECTED_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc
new file mode 100644
index 000000000..2cdfe1b6e
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenericReshapeLayer.h"
+#include "internal/arm_compute.h"
+
+void GenericReshapeLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output)
+{
+ _input = input;
+ _output = output;
+
+ // NOTE This vector comes from CLPermuteKernel implementation
+ //
+ // This implementation permutes a tensor of shape W / H / C into another tensor of shape C / W / H
+ //
+ // Original | Permuted
+ // 0 | W | C (from 2)
+ // 1 | H | W (from 0)
+ // 2 | C | H (from 1)
+ //
+ const ::arm_compute::PermutationVector pv{2, 0, 1};
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_permute.configure(CAST_CL(input), &_cl_permuted, pv);
+ _cl_reshape.configure(&_cl_permuted, CAST_CL(output));
+
+ // NOTE _cl_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _neon_permute.configure(CAST_NE(input), &_neon_permuted, pv);
+ _neon_reshape.configure(&_neon_permuted, CAST_NE(output));
+
+ // NOTE _neon_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _neon_permuted.allocator()->allocate();
+ }
+}
+
+void GenericReshapeLayer::run(void)
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_permute.run();
+ _cl_reshape.run();
+ }
+ else
+ {
+ _neon_permute.run();
+ _neon_reshape.run();
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h
new file mode 100644
index 000000000..1def21085
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __GENERIC_RESHAPE_LAYER_H__
+#define __GENERIC_RESHAPE_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEPermute.h>
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+
+class GenericReshapeLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output);
+
+public:
+ void run(void) override;
+
+private:
+ ::arm_compute::ITensor *_input;
+ ::arm_compute::ITensor *_output;
+ ::arm_compute::CLTensor _cl_permuted;
+ ::arm_compute::Tensor _neon_permuted;
+
+private:
+ ::arm_compute::CLPermute _cl_permute;
+ ::arm_compute::CLReshapeLayer _cl_reshape;
+
+ ::arm_compute::NEPermute _neon_permute;
+ ::arm_compute::NEReshapeLayer _neon_reshape;
+};
+
+#endif // __GENERIC_RESHAPE_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
new file mode 100644
index 000000000..4a5370587
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "PadLayer.h"
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+ unsigned int border_width)
+{
+ _input = input;
+ _output = output;
+ _border_width = border_width;
+ _output_height = _output->info()->dimension(0);
+ _output_width = _output->info()->dimension(1);
+
+ uint8_t constant_border_value = 0;
+ ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);
+
+ unsigned int padding_size = _border_width;
+ input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});
+ _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,
+ constant_pixel_value);
+}
+
+void PadLayer::run(void)
+{
+ _fillborderkernel.run();
+
+ ::arm_compute::Coordinates coordinates =
+ ::arm_compute::Coordinates(-_border_width, -_border_width);
+ ::arm_compute::TensorShape new_tensor_shape =
+ ::arm_compute::TensorShape(_output_height, _output_width);
+
+ /* NOTE: The CL kernel fills the data in the borders (not in the tensor).
+ Once the tensor is received back at NNAPI, we adjust the valid region
+ so that the padding becomes part of the tensor itself and matches the
+ size of the output. */
+ _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
+
+ /* NOTE: Since the CL kernel does not take an output tensor argument while NNAPI does,
+ we need to map the input (the tensor passed to the CL kernel) back to the
+ output. */
+
+ // TODO: Write a modified CLCopy kernel to do this job.
+ populateOutput();
+}
+
+void PadLayer::populateOutput()
+{
+ auto &queue = ::arm_compute::CLScheduler::get().queue();
+ _input->map(queue);
+ _output->map(queue);
+
+ auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
+ auto const source_data = input_tensor->buffer();
+
+ auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
+ auto dst_data = output_tensor->buffer();
+
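+ // NOTE Assumes 4-byte elements (e.g. F32/S32); the element size should ideally be derived from the tensor info.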
+ memmove(dst_data, source_data, _output_height * _output_width * 4);
+
+ _input->unmap(queue);
+ _output->unmap(queue);
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
new file mode 100644
index 000000000..cb3f36337
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PAD_LAYER_H__
+#define __PAD_LAYER_H__
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
+
+class PadLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+ unsigned int border_width);
+ void run(void) override;
+
+private:
+ ::arm_compute::ICLTensor *_input;
+ ::arm_compute::ICLTensor *_output;
+ int _border_width;
+ int _output_height;
+ int _output_width;
+
+ ::arm_compute::CLFillBorder _fillborderkernel;
+ void populateOutput();
+};
+
+#endif // __PAD_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h
new file mode 100644
index 000000000..31c927b4f
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_ARITHMETIC_ADDITION_H__
+#define __SIMPLE_ARITHMETIC_ADDITION_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+
+class SimpleArithmeticAddition : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *lhs, ::arm_compute::ITensor *rhs,
+ ::arm_compute::ITensor *out)
+ {
+ _lhs = lhs;
+ _rhs = rhs;
+ _out = out;
+ }
+
+public:
+ void run(void) override
+ {
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_lhs)->map(q);
+ CAST_CL(_rhs)->map(q);
+ CAST_CL(_out)->map(q);
+ }
+
+ arm_compute::Window window;
+ window.use_tensor_dimensions(_out->info()->tensor_shape());
+
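+ // Element-wise addition over the output window on the CPU; both inputs are indexed with the same
+ // coordinates, so no broadcasting is performed.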
+ execute_window_loop(window, [this](const arm_compute::Coordinates &id) {
+ // NOTE The two input tensors must have identical types, and the output tensor
+ // must have the same type as the inputs.
+ assert(_lhs->info()->data_type() == _rhs->info()->data_type());
+ assert(_lhs->info()->data_type() == _out->info()->data_type());
+
+ switch (_lhs->info()->data_type())
+ {
+ case ::arm_compute::DataType::F32:
+ {
+ const auto lhs_value = *reinterpret_cast<float *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<float *>(_rhs->ptr_to_element(id));
+ *reinterpret_cast<float *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ case ::arm_compute::DataType::S32:
+ {
+ const auto lhs_value = *reinterpret_cast<int32_t *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<int32_t *>(_rhs->ptr_to_element(id));
+ *reinterpret_cast<int32_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ case ::arm_compute::DataType::U32:
+ {
+ const auto lhs_value = *reinterpret_cast<uint32_t *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<uint32_t *>(_rhs->ptr_to_element(id));
+ *reinterpret_cast<uint32_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ case ::arm_compute::DataType::QASYMM8:
+ {
+ const auto lhs_value = *reinterpret_cast<uint8_t *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<uint8_t *>(_rhs->ptr_to_element(id));
+ // TODO Decide how to handle overflow (the unsigned addition below simply wraps)
+ *reinterpret_cast<uint8_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+ });
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_out)->unmap(q);
+ CAST_CL(_rhs)->unmap(q);
+ CAST_CL(_lhs)->unmap(q);
+ }
+ }
+
+private:
+ ::arm_compute::ITensor *_lhs;
+ ::arm_compute::ITensor *_rhs;
+ ::arm_compute::ITensor *_out;
+};
+
+#endif // __SIMPLE_ARITHMETIC_ADDITION_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h
new file mode 100644
index 000000000..fa3006438
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_CAST_LAYER_H__
+#define __SIMPLE_CAST_LAYER_H__
+
+#include <arm_compute/core/ITensor.h>
+
+#include "internal/arm_compute.h"
+#include "internal/op/Cast.h"
+
+class SimpleCastLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out)
+ {
+ _in = in;
+ _out = out;
+ }
+
+public:
+ void run(void) override
+ {
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_in)->map(q);
+ CAST_CL(_out)->map(q);
+ }
+
+ arm_compute::Window window;
+ window.use_tensor_dimensions(_out->info()->tensor_shape());
+
+ execute_window_loop(window,
+ [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); });
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_out)->unmap(q);
+ CAST_CL(_in)->unmap(q);
+ }
+ }
+
+ void castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out,
+ const arm_compute::Coordinates &id)
+ {
+ switch (in->info()->data_type())
+ {
+ case ::arm_compute::DataType::F32:
+ {
+ copyCast(*reinterpret_cast<float *>(in->ptr_to_element(id)), out, id);
+ break;
+ }
+ case ::arm_compute::DataType::S32:
+ {
+ copyCast(*reinterpret_cast<int32_t *>(in->ptr_to_element(id)), out, id);
+ break;
+ }
+ case ::arm_compute::DataType::U32:
+ {
+ copyCast(*reinterpret_cast<uint32_t *>(in->ptr_to_element(id)), out, id);
+ break;
+ }
+ case ::arm_compute::DataType::QASYMM8:
+ {
+ const uint8_t quantizedValue = *(in->ptr_to_element(id));
+ copyCast(in->info()->quantization_info().dequantize(quantizedValue), out, id);
+ break;
+ }
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+ }
+
+private:
+ ::arm_compute::ITensor *_in;
+ ::arm_compute::ITensor *_out;
+};
+
+#endif // __SIMPLE_CAST_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
new file mode 100644
index 000000000..089c783c1
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
@@ -0,0 +1,115 @@
+#include "internal/layers/SimpleEmbeddingLookup.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups,
+ ::arm_compute::ITensor *values,
+ ::arm_compute::ITensor *output)
+{
+ // Assume that verification of operands is already done at Planner::visit()
+ _lookups = lookups;
+ _values = values;
+ _output = output;
+}
+
+void SimpleEmbeddingLookup::run()
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_lookups)->map(q);
+ CAST_CL(_values)->map(q);
+ CAST_CL(_output)->map(q);
+ }
+
+ // type of elements of lookups is always integer
+ // The element type of lookups is always int32
+ const auto values_buf = _values->buffer();
+ auto output_buf = _output->buffer();
+
+ const auto lookups_info = _lookups->info();
+ const auto values_info = _values->info();
+ const auto output_info = _output->info();
+
+ // TODO Refactor below duplicated code!
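+ // For each index in 'lookups', one whole row of 'values' (all elements of the remaining dimensions) is
+ // copied into the corresponding row of 'output'. 'row_size' is the number of rows along the outermost
+ // NNAPI dimension and 'row_bytes' is the byte size of a single row.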
+ const auto values_rank = values_info->num_dimensions();
+ switch (values_rank)
+ {
+ case 2:
+ // (H,W) in nnapi -> (W,H) in acl
+ {
+ const size_t row_size = values_info->dimension(1);
+ const size_t row_bytes = values_info->total_size() / row_size;
+ for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+ {
+ if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size)
+ throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+ size_t idx = lookups_buf[i];
+ size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx});
+ size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i});
+
+ unsigned char *sink_addr = output_buf + row_offset_by_i;
+ unsigned char *source_addr = values_buf + row_offset_by_idx;
+ memcpy(sink_addr, source_addr, row_bytes);
+ }
+ }
+ break;
+ case 3:
+ // (B,H,W) in nnapi -> (W,H,B) in acl
+ {
+ const size_t row_size = values_info->dimension(2);
+ const size_t row_bytes = values_info->total_size() / row_size;
+ for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+ {
+ if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size)
+ throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+ size_t idx = lookups_buf[i];
+ size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx});
+ size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i});
+
+ unsigned char *sink_addr = output_buf + row_offset_by_i;
+ unsigned char *source_addr = values_buf + row_offset_by_idx;
+ memcpy(sink_addr, source_addr, row_bytes);
+ }
+ }
+ break;
+ case 4:
+ // (N,H,W,C) in nnapi -> (N,C,H,W) in acl
+ {
+ const size_t row_size = values_info->dimension(3);
+ const size_t row_bytes = values_info->total_size() / row_size;
+ for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+ {
+ if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size)
+ throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+ size_t idx = lookups_buf[i];
+ size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, 0, idx});
+ size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i});
+
+ unsigned char *sink_addr = output_buf + row_offset_by_i;
+ unsigned char *source_addr = values_buf + row_offset_by_idx;
+ memcpy(sink_addr, source_addr, row_bytes);
+ }
+ }
+ break;
+ case 1:
+ // In this case, the shape of values is actually a matrix whose height (row size) is 1 in ACL.
+ // If the row size is 1, this op is not needed, so such a situation is treated as wrong usage.
+ throw std::runtime_error("Wrong usage of EmbeddingLookup op!");
+ default:
+ throw std::runtime_error("Not supported rank!");
+ }
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_lookups)->unmap(q);
+ CAST_CL(_values)->unmap(q);
+ CAST_CL(_output)->unmap(q);
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
new file mode 100644
index 000000000..9f2cd977f
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
@@ -0,0 +1,22 @@
+#ifndef __SIMPLE_EMBEDDING_LOOKUP_H__
+#define __SIMPLE_EMBEDDING_LOOKUP_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleEmbeddingLookup : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values,
+ ::arm_compute::ITensor *output);
+
+ void run() override;
+
+private:
+ ::arm_compute::ITensor *_lookups;
+ ::arm_compute::ITensor *_values;
+ ::arm_compute::ITensor *_output;
+};
+
+#endif /*__SIMPLE_EMBEDDING_LOOKUP_H__ */
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc
new file mode 100644
index 000000000..682295f81
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "internal/layers/SimpleSpaceToDepth.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+ int32_t block_size,
+ const ::arm_compute::Coordinates &axises = {3, 1, 0, 2})
+{
+ assert(input->info()->num_dimensions() == 4);
+ assert(output->info()->num_dimensions() == 4);
+ const auto rank = axises.num_dimensions();
+ assert(rank == 4);
+ for (int i = 0; i < rank; ++i)
+ {
+ assert(axises[i] >= 0);
+ assert(axises[i] < rank);
+ }
+
+ _input = input;
+ _output = output;
+ _block_size = block_size;
+ _axises = axises;
+}
+
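+ // Maps the logical (b, h, w, d) coordinates onto the tensor dimensions selected by 'axises' and returns
+ // the linear element offset, with dimension 0 as the innermost (fastest varying) dimension.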
+inline int32_t Offset4D(const ::arm_compute::TensorShape &shape, int32_t b, int32_t h, int32_t w,
+ int32_t d, const ::arm_compute::Coordinates &axises)
+{
+ // b, h, w, d >= 0
+ size_t indexes[4];
+ indexes[axises[0]] = b;
+ indexes[axises[1]] = h;
+ indexes[axises[2]] = w;
+ indexes[axises[3]] = d;
+
+ int32_t offset = indexes[3] * shape[2] * shape[1] * shape[0];
+ offset += indexes[2] * shape[1] * shape[0];
+ offset += indexes[1] * shape[0];
+ offset += indexes[0];
+ return offset;
+}
+
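+ // Rearranges spatial blocks of block_size x block_size into depth: input element (b, h, w, d) moves to
+ // (b, h / block_size, w / block_size, d + ((h % block_size) * block_size + w % block_size) * input_depth).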
+template <typename T>
+inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape &input_shape,
+ int32_t block_size, T *output_data,
+ const ::arm_compute::TensorShape &output_shape,
+ const ::arm_compute::Coordinates &axises)
+{
+ const int input_batch = input_shape[axises[0]];
+ const int input_height = input_shape[axises[1]];
+ const int input_width = input_shape[axises[2]];
+ const int input_depth = input_shape[axises[3]];
+
+ const int output_batch = output_shape[axises[0]];
+ const int output_height = output_shape[axises[1]];
+ const int output_width = output_shape[axises[2]];
+ const int output_depth = output_shape[axises[3]];
+
+ assert(input_batch == output_batch);
+ assert(input_height == output_height * block_size);
+ assert(input_width == output_width * block_size);
+ assert(input_depth * block_size * block_size == output_depth);
+
+ for (int in_b = 0; in_b < input_batch; ++in_b)
+ {
+ for (int in_h = 0; in_h < input_height; ++in_h)
+ {
+ for (int in_w = 0; in_w < input_width; ++in_w)
+ {
+ for (int in_d = 0; in_d < input_depth; ++in_d)
+ {
+ const int out_b = in_b;
+ const int out_h = in_h / block_size;
+ const int out_w = in_w / block_size;
+ const int out_d =
+ in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth;
+
+ const int input_index = Offset4D(input_shape, in_b, in_h, in_w, in_d, axises);
+ const int output_index = Offset4D(output_shape, out_b, out_h, out_w, out_d, axises);
+
+ output_data[output_index] = input_data[input_index];
+ }
+ }
+ }
+ }
+}
+
+void SimpleSpaceToDepth::run()
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_input)->map(q);
+ CAST_CL(_output)->map(q);
+ }
+
+ auto input_buf = _input->buffer();
+ auto output_buf = _output->buffer();
+ switch (_input->info()->data_type())
+ {
+ case ::arm_compute::DataType::U8:
+ case ::arm_compute::DataType::QASYMM8:
+ SpaceToDepth(reinterpret_cast<const uint8_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<uint8_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::S8:
+ SpaceToDepth(reinterpret_cast<const int8_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<int8_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::U32:
+ SpaceToDepth(reinterpret_cast<const uint32_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<uint32_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::S32:
+ SpaceToDepth(reinterpret_cast<const int32_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<int32_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::F32:
+ SpaceToDepth(reinterpret_cast<const float *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<float *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_input)->unmap(q);
+ CAST_CL(_output)->unmap(q);
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h
new file mode 100644
index 000000000..f5e028b1c
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_SPACE_TO_DEPTH_H__
+#define __SIMPLE_SPACE_TO_DEPTH_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleSpaceToDepth : public ::arm_compute::IFunction
+{
+public:
+ /** Initialise input and output
+ *
+ * @param[in] input First tensor input.
+ * @param[out] output Output tensor.
+   * @param[in]  block_size  Block size.
+   * @param[in]  axises      Mapping from the logical (batch, height, width, depth) order to tensor dimension indices.
+   */
+ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size,
+ const ::arm_compute::Coordinates &axises);
+
+ void run() override;
+
+private:
+ ::arm_compute::ITensor *_input;
+ ::arm_compute::ITensor *_output;
+ int32_t _block_size;
+ ::arm_compute::Coordinates _axises;
+};
+
+#endif /*__SIMPLE_SPACE_TO_DEPTH_H__ */
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc
new file mode 100644
index 000000000..3f988a819
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc
@@ -0,0 +1,40 @@
+#include "SquaredDifferenceOperation.h"
+#include "internal/arm_compute.h"
+
+void SquaredDifferenceOperation::configure(::arm_compute::ITensor *input1,
+ ::arm_compute::ITensor *input2,
+ ::arm_compute::ITensor *output,
+ ::arm_compute::ConvertPolicy ConvertPolicy, float scale,
+ ::arm_compute::RoundingPolicy RoundingPolicy)
+{
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
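+ // SquaredDifference is computed in two steps: out = input1 - input2, then out = out * out (element-wise, in place).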
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_sub.configure(CAST_CL(input1), CAST_CL(input2), CAST_CL(output), ConvertPolicy);
+ _cl_mul.configure(CAST_CL(output), CAST_CL(output), CAST_CL(output), scale, ConvertPolicy,
+ RoundingPolicy);
+ }
+ else
+ {
+ _neon_sub.configure(CAST_NE(input1), CAST_NE(input2), CAST_NE(output), ConvertPolicy);
+ _neon_mul.configure(CAST_NE(output), CAST_NE(output), CAST_NE(output), scale, ConvertPolicy,
+ RoundingPolicy);
+ }
+}
+
+void SquaredDifferenceOperation::run(void)
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_sub.run();
+ _cl_mul.run();
+ }
+ else
+ {
+ _neon_sub.run();
+ _neon_mul.run();
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h
new file mode 100644
index 000000000..3782c4e8c
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h
@@ -0,0 +1,35 @@
+#ifndef __SQUARED_DIFFERENCE_OPERATION_H__
+#define __SQUARED_DIFFERENCE_OPERATION_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h>
+#include <arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h>
+#include <arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h>
+#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h>
+
+class SquaredDifferenceOperation : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *input1, ::arm_compute::ITensor *input2,
+ ::arm_compute::ITensor *output, ::arm_compute::ConvertPolicy ConvertPolicy,
+ float scale, ::arm_compute::RoundingPolicy RoundingPolicy);
+
+public:
+ void run(void) override;
+
+private:
+ ::arm_compute::ITensor *_input1;
+ ::arm_compute::ITensor *_input2;
+
+ ::arm_compute::ITensor *_output;
+
+private:
+ ::arm_compute::CLArithmeticSubtraction _cl_sub;
+ ::arm_compute::CLPixelWiseMultiplication _cl_mul;
+
+ ::arm_compute::NEArithmeticSubtraction _neon_sub;
+ ::arm_compute::NEPixelWiseMultiplication _neon_mul;
+};
+#endif // __SQUARED_DIFFERENCE_OPERATION_H__