Diffstat (limited to 'runtimes/pure_arm_compute/src/internal/layers')
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h          88
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc  90
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h   53
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc         66
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h          50
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc                    78
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/PadLayer.h                     41
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h     108
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h              95
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc       115
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h        22
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc          155
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h           45
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc  40
-rw-r--r--  runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h   35
15 files changed, 1081 insertions, 0 deletions
diff --git a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h
new file mode 100644
index 000000000..502a1ee0e
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FEATURE_LOGGING_LAYER_H__
+#define __FEATURE_LOGGING_LAYER_H__
+
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <iostream>
+#include <iomanip>
+#include <limits>
+
+#include "internal/arm_compute.h"
+
+class FeatureLoggingLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(const std::string &tag, ::arm_compute::ITensor *target)
+ {
+ _tag = tag;
+ _target = target;
+ }
+
+public:
+ void run(void) override
+ {
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_target)->map(q);
+ }
+
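+ // Dump the target feature map channel by channel as H x W grids (values are read as float; see the TODO below)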
+ const size_t W = _target->info()->dimension(0);
+ const size_t H = _target->info()->dimension(1);
+ const size_t C = _target->info()->dimension(2);
+
+ std::cout << _tag << std::endl;
+
+ for (size_t ch = 0; ch < C; ++ch)
+ {
+ std::cout << "Channel #" << ch << std::endl;
+ for (size_t row = 0; row < H; ++row)
+ {
+ for (size_t col = 0; col < W; ++col)
+ {
+ const arm_compute::Coordinates id{col, row, ch};
+ const auto value = *reinterpret_cast<float *>(_target->ptr_to_element(id));
+
+ // TODO Generalize this to integer types
+ std::cout << std::setprecision(2);
+ std::cout << std::setw(7);
+ std::cout << std::setfill(' ');
+ std::cout << std::fixed;
+ std::cout << value << " ";
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ }
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_target)->unmap(q);
+ }
+ }
+
+private:
+ std::string _tag;
+ ::arm_compute::ITensor *_target;
+};
+
+#endif // __FEATURE_LOGGING_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
new file mode 100644
index 000000000..311284efc
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenericFullyConnectedLayer.h"
+#include "internal/arm_compute.h"
+
+#include <arm_compute/core/Helpers.h>
+
+void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input,
+ ::arm_compute::ITensor *weights,
+ ::arm_compute::ITensor *biases,
+ ::arm_compute::ITensor *output, bool needs_reshape,
+ ::arm_compute::TensorShape reshape)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
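+ // When 'needs_reshape' is set, the input is first rearranged to 'reshape' by GenericReshapeLayer into an
+ // internal buffer, and that buffer feeds the fully connected layer.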
+ // TODO Too much duplicated code. Revise the code below.
+ if (::internal::arm_compute::isGpuMode())
+ {
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+ _generic_reshape.configure(CAST_CL(_input), &_cl_buffer);
+
+ _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+
+ // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_buffer.allocator()->allocate();
+ }
+ else
+ {
+ _cl_fc.configure(CAST_CL(_input), CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+ }
+ }
+ else
+ {
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+ _generic_reshape.configure(CAST_NE(_input), &_neon_buffer);
+
+ _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+
+ // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate
+ // here.
+ _neon_buffer.allocator()->allocate();
+ }
+ else
+ {
+ _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+ }
+ }
+}
+
+void GenericFullyConnectedLayer::run(void)
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ if (_needs_reshape)
+ _generic_reshape.run();
+
+ _cl_fc.run();
+ }
+ else
+ {
+ if (_needs_reshape)
+ _generic_reshape.run();
+
+ _neon_fc.run();
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
new file mode 100644
index 000000000..55d8683da
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__
+#define __GENERIC_FULLY_CONNECTED_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include "internal/layers/GenericReshapeLayer.h"
+
+class GenericFullyConnectedLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
+ ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape,
+ ::arm_compute::TensorShape reshape);
+
+public:
+ void run(void) override;
+
+private:
+ ::arm_compute::ITensor *_input;
+ ::arm_compute::ITensor *_weights;
+ ::arm_compute::ITensor *_biases;
+ ::arm_compute::ITensor *_output;
+
+ // buffer for reshaping input tensor
+ ::arm_compute::CLTensor _cl_buffer;
+ ::arm_compute::Tensor _neon_buffer;
+
+private:
+ ::arm_compute::CLFullyConnectedLayer _cl_fc;
+ ::arm_compute::NEFullyConnectedLayer _neon_fc;
+ GenericReshapeLayer _generic_reshape;
+ bool _needs_reshape;
+};
+
+#endif // __GENERIC_FULLY_CONNECTED_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc
new file mode 100644
index 000000000..2cdfe1b6e
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenericReshapeLayer.h"
+#include "internal/arm_compute.h"
+
+void GenericReshapeLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output)
+{
+ _input = input;
+ _output = output;
+
+ // NOTE This vector comes from CLPermuteKernel implementation
+ //
+ // This implementation permutes a tensor of shape W / H / C into another tensor of shape C / W / H
+ //
+ // Original | Permuted
+ // 0 | W | C (from 2)
+ // 1 | H | W (from 0)
+ // 2 | C | H (from 1)
+ //
+ const ::arm_compute::PermutationVector pv{2, 0, 1};
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_permute.configure(CAST_CL(input), &_cl_permuted, pv);
+ _cl_reshape.configure(&_cl_permuted, CAST_CL(output));
+
+ // NOTE _cl_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _neon_permute.configure(CAST_NE(input), &_neon_permuted, pv);
+ _neon_reshape.configure(&_neon_permuted, CAST_NE(output));
+
+ // NOTE _neon_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _neon_permuted.allocator()->allocate();
+ }
+}
+
+void GenericReshapeLayer::run(void)
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_permute.run();
+ _cl_reshape.run();
+ }
+ else
+ {
+ _neon_permute.run();
+ _neon_reshape.run();
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h
new file mode 100644
index 000000000..1def21085
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __GENERIC_RESHAPE_LAYER_H__
+#define __GENERIC_RESHAPE_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEPermute.h>
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+
+class GenericReshapeLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output);
+
+public:
+ void run(void) override;
+
+private:
+ ::arm_compute::ITensor *_input;
+ ::arm_compute::ITensor *_output;
+ ::arm_compute::CLTensor _cl_permuted;
+ ::arm_compute::Tensor _neon_permuted;
+
+private:
+ ::arm_compute::CLPermute _cl_permute;
+ ::arm_compute::CLReshapeLayer _cl_reshape;
+
+ ::arm_compute::NEPermute _neon_permute;
+ ::arm_compute::NEReshapeLayer _neon_reshape;
+};
+
+#endif // __GENERIC_RESHAPE_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
new file mode 100644
index 000000000..4a5370587
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "PadLayer.h"
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+ unsigned int border_width)
+{
+ _input = input;
+ _output = output;
+ _border_width = border_width;
+ _output_height = _output->info()->dimension(0);
+ _output_width = _output->info()->dimension(1);
+
+ uint8_t constant_border_value = 0;
+ ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);
+
+ unsigned int padding_size = _border_width;
+ input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});
+ _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,
+ constant_pixel_value);
+}
+
+void PadLayer::run(void)
+{
+ _fillborderkernel.run();
+
+ ::arm_compute::Coordinates coordinates =
+ ::arm_compute::Coordinates(-_border_width, -_border_width);
+ ::arm_compute::TensorShape new_tensor_shape =
+ ::arm_compute::TensorShape(_output_height, _output_width);
+
+ /* NOTE: The CL kernel fills the data in the borders (not in the tensor).
+ Once the tensor is received back at NNAPI, we adjust the valid region
+ so that the padding becomes part of the tensor itself and matches the
+ size of the output. */
+ _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
+
+ /* NOTE: Since the CL kernel does not take an output tensor argument while NNAPI does,
+ we need to map the input (the tensor passed to the CL kernel) back to the
+ output. */
+
+ // TODO: Write a modified CLCopy kernel to do this job.
+ populateOutput();
+}
+
+void PadLayer::populateOutput()
+{
+ auto &queue = ::arm_compute::CLScheduler::get().queue();
+ _input->map(queue);
+ _output->map(queue);
+
+ auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
+ auto const source_data = input_tensor->buffer();
+
+ auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
+ auto dst_data = output_tensor->buffer();
+
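+ // NOTE Assumes 4-byte elements (e.g. F32/S32); the element size should ideally be derived from the tensor info.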
+ memmove(dst_data, source_data, _output_height * _output_width * 4);
+
+ _input->unmap(queue);
+ _output->unmap(queue);
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
new file mode 100644
index 000000000..cb3f36337
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PAD_LAYER_H__
+#define __PAD_LAYER_H__
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
+
+class PadLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+ unsigned int border_width);
+ void run(void) override;
+
+private:
+ ::arm_compute::ICLTensor *_input;
+ ::arm_compute::ICLTensor *_output;
+ int _border_width;
+ int _output_height;
+ int _output_width;
+
+ ::arm_compute::CLFillBorder _fillborderkernel;
+ void populateOutput();
+};
+
+#endif // __PAD_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h
new file mode 100644
index 000000000..31c927b4f
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_ARITHMETIC_ADDITION_H__
+#define __SIMPLE_ARITHMETIC_ADDITION_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+
+class SimpleArithmeticAddition : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *lhs, ::arm_compute::ITensor *rhs,
+ ::arm_compute::ITensor *out)
+ {
+ _lhs = lhs;
+ _rhs = rhs;
+ _out = out;
+ }
+
+public:
+ void run(void) override
+ {
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_lhs)->map(q);
+ CAST_CL(_rhs)->map(q);
+ CAST_CL(_out)->map(q);
+ }
+
+ arm_compute::Window window;
+ window.use_tensor_dimensions(_out->info()->tensor_shape());
+
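+ // Element-wise addition over the output window on the CPU; both inputs are indexed with the same
+ // coordinates, so no broadcasting is performed.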
+ execute_window_loop(window, [this](const arm_compute::Coordinates &id) {
+ // NOTE The two input tensors must have identical types, and the output tensor
+ // must have the same type as the inputs.
+ assert(_lhs->info()->data_type() == _rhs->info()->data_type());
+ assert(_lhs->info()->data_type() == _out->info()->data_type());
+
+ switch (_lhs->info()->data_type())
+ {
+ case ::arm_compute::DataType::F32:
+ {
+ const auto lhs_value = *reinterpret_cast<float *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<float *>(_rhs->ptr_to_element(id));
+ *reinterpret_cast<float *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ case ::arm_compute::DataType::S32:
+ {
+ const auto lhs_value = *reinterpret_cast<int32_t *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<int32_t *>(_rhs->ptr_to_element(id));
+ *reinterpret_cast<int32_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ case ::arm_compute::DataType::U32:
+ {
+ const auto lhs_value = *reinterpret_cast<uint32_t *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<uint32_t *>(_rhs->ptr_to_element(id));
+ *reinterpret_cast<uint32_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ case ::arm_compute::DataType::QASYMM8:
+ {
+ const auto lhs_value = *reinterpret_cast<uint8_t *>(_lhs->ptr_to_element(id));
+ const auto rhs_value = *reinterpret_cast<uint8_t *>(_rhs->ptr_to_element(id));
+ // TODO Decide how to handle overflow (the unsigned addition below simply wraps)
+ *reinterpret_cast<uint8_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value;
+ break;
+ }
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+ });
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_out)->unmap(q);
+ CAST_CL(_rhs)->unmap(q);
+ CAST_CL(_lhs)->unmap(q);
+ }
+ }
+
+private:
+ ::arm_compute::ITensor *_lhs;
+ ::arm_compute::ITensor *_rhs;
+ ::arm_compute::ITensor *_out;
+};
+
+#endif // __SIMPLE_ARITHMETIC_ADDITION_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h
new file mode 100644
index 000000000..fa3006438
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_CAST_LAYER_H__
+#define __SIMPLE_CAST_LAYER_H__
+
+#include <arm_compute/core/ITensor.h>
+
+#include "internal/arm_compute.h"
+#include "internal/op/Cast.h"
+
+class SimpleCastLayer : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out)
+ {
+ _in = in;
+ _out = out;
+ }
+
+public:
+ void run(void) override
+ {
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_in)->map(q);
+ CAST_CL(_out)->map(q);
+ }
+
+ arm_compute::Window window;
+ window.use_tensor_dimensions(_out->info()->tensor_shape());
+
+ execute_window_loop(window,
+ [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); });
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+ CAST_CL(_out)->unmap(q);
+ CAST_CL(_in)->unmap(q);
+ }
+ }
+
+ void castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out,
+ const arm_compute::Coordinates &id)
+ {
+ switch (in->info()->data_type())
+ {
+ case ::arm_compute::DataType::F32:
+ {
+ copyCast(*reinterpret_cast<float *>(in->ptr_to_element(id)), out, id);
+ break;
+ }
+ case ::arm_compute::DataType::S32:
+ {
+ copyCast(*reinterpret_cast<int32_t *>(in->ptr_to_element(id)), out, id);
+ break;
+ }
+ case ::arm_compute::DataType::U32:
+ {
+ copyCast(*reinterpret_cast<uint32_t *>(in->ptr_to_element(id)), out, id);
+ break;
+ }
+ case ::arm_compute::DataType::QASYMM8:
+ {
+ const uint8_t quantizedValue = *(in->ptr_to_element(id));
+ copyCast(in->info()->quantization_info().dequantize(quantizedValue), out, id);
+ break;
+ }
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+ }
+
+private:
+ ::arm_compute::ITensor *_in;
+ ::arm_compute::ITensor *_out;
+};
+
+#endif // __SIMPLE_CAST_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
new file mode 100644
index 000000000..089c783c1
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
@@ -0,0 +1,115 @@
+#include "internal/layers/SimpleEmbeddingLookup.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups,
+ ::arm_compute::ITensor *values,
+ ::arm_compute::ITensor *output)
+{
+ // Assume that verification of operands is already done at Planner::visit()
+ _lookups = lookups;
+ _values = values;
+ _output = output;
+}
+
+void SimpleEmbeddingLookup::run()
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_lookups)->map(q);
+ CAST_CL(_values)->map(q);
+ CAST_CL(_output)->map(q);
+ }
+
+ // type of elements of lookups is always integer
+ // The element type of lookups is always int32
+ const auto values_buf = _values->buffer();
+ auto output_buf = _output->buffer();
+
+ const auto lookups_info = _lookups->info();
+ const auto values_info = _values->info();
+ const auto output_info = _output->info();
+
+ // TODO Refactor below duplicated code!
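+ // For each index in 'lookups', one whole row of 'values' (all elements of the remaining dimensions) is
+ // copied into the corresponding row of 'output'. 'row_size' is the number of rows along the outermost
+ // NNAPI dimension and 'row_bytes' is the byte size of a single row.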
+ const auto values_rank = values_info->num_dimensions();
+ switch (values_rank)
+ {
+ case 2:
+ // (H,W) in nnapi -> (W,H) in acl
+ {
+ const size_t row_size = values_info->dimension(1);
+ const size_t row_bytes = values_info->total_size() / row_size;
+ for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+ {
+ if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size)
+ throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+ size_t idx = lookups_buf[i];
+ size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx});
+ size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i});
+
+ unsigned char *sink_addr = output_buf + row_offset_by_i;
+ unsigned char *source_addr = values_buf + row_offset_by_idx;
+ memcpy(sink_addr, source_addr, row_bytes);
+ }
+ }
+ break;
+ case 3:
+ // (B,H,W) in nnapi -> (W,H,B) in acl
+ {
+ const size_t row_size = values_info->dimension(2);
+ const size_t row_bytes = values_info->total_size() / row_size;
+ for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+ {
+ if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size)
+ throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+ size_t idx = lookups_buf[i];
+ size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx});
+ size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i});
+
+ unsigned char *sink_addr = output_buf + row_offset_by_i;
+ unsigned char *source_addr = values_buf + row_offset_by_idx;
+ memcpy(sink_addr, source_addr, row_bytes);
+ }
+ }
+ break;
+ case 4:
+ // (N,H,W,C) in nnapi -> (N,C,H,W) in acl
+ {
+ const size_t row_size = values_info->dimension(3);
+ const size_t row_bytes = values_info->total_size() / row_size;
+ for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+ {
+ if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size)
+ throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+ size_t idx = lookups_buf[i];
+ size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, 0, idx});
+ size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i});
+
+ unsigned char *sink_addr = output_buf + row_offset_by_i;
+ unsigned char *source_addr = values_buf + row_offset_by_idx;
+ memcpy(sink_addr, source_addr, row_bytes);
+ }
+ }
+ break;
+ case 1:
+ // In this case, the shape of values is actually a matrix whose height (row size) is 1 in ACL.
+ // If the row size is 1, this op is not needed, so such a situation is treated as wrong usage.
+ throw std::runtime_error("Wrong usage of EmbeddingLookup op!");
+ default:
+ throw std::runtime_error("Not supported rank!");
+ }
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_lookups)->unmap(q);
+ CAST_CL(_values)->unmap(q);
+ CAST_CL(_output)->unmap(q);
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
new file mode 100644
index 000000000..9f2cd977f
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
@@ -0,0 +1,22 @@
+#ifndef __SIMPLE_EMBEDDING_LOOKUP_H__
+#define __SIMPLE_EMBEDDING_LOOKUP_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleEmbeddingLookup : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values,
+ ::arm_compute::ITensor *output);
+
+ void run() override;
+
+private:
+ ::arm_compute::ITensor *_lookups;
+ ::arm_compute::ITensor *_values;
+ ::arm_compute::ITensor *_output;
+};
+
+#endif /*__SIMPLE_EMBEDDING_LOOKUP_H__ */
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc
new file mode 100644
index 000000000..682295f81
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "internal/layers/SimpleSpaceToDepth.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+ int32_t block_size,
+ const ::arm_compute::Coordinates &axises = {3, 1, 0, 2})
+{
+ assert(input->info()->num_dimensions() == 4);
+ assert(output->info()->num_dimensions() == 4);
+ const auto rank = axises.num_dimensions();
+ assert(rank == 4);
+ for (int i = 0; i < rank; ++i)
+ {
+ assert(axises[i] >= 0);
+ assert(axises[i] < rank);
+ }
+
+ _input = input;
+ _output = output;
+ _block_size = block_size;
+ _axises = axises;
+}
+
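+ // Maps the logical (b, h, w, d) coordinates onto the tensor dimensions selected by 'axises' and returns
+ // the linear element offset, with dimension 0 as the innermost (fastest varying) dimension.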
+inline int32_t Offset4D(const ::arm_compute::TensorShape &shape, int32_t b, int32_t h, int32_t w,
+ int32_t d, const ::arm_compute::Coordinates &axises)
+{
+ // b, h, w, d >= 0
+ size_t indexes[4];
+ indexes[axises[0]] = b;
+ indexes[axises[1]] = h;
+ indexes[axises[2]] = w;
+ indexes[axises[3]] = d;
+
+ int32_t offset = indexes[3] * shape[2] * shape[1] * shape[0];
+ offset += indexes[2] * shape[1] * shape[0];
+ offset += indexes[1] * shape[0];
+ offset += indexes[0];
+ return offset;
+}
+
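+ // Rearranges spatial blocks of block_size x block_size into depth: input element (b, h, w, d) moves to
+ // (b, h / block_size, w / block_size, d + ((h % block_size) * block_size + w % block_size) * input_depth).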
+template <typename T>
+inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape &input_shape,
+ int32_t block_size, T *output_data,
+ const ::arm_compute::TensorShape &output_shape,
+ const ::arm_compute::Coordinates &axises)
+{
+ const int input_batch = input_shape[axises[0]];
+ const int input_height = input_shape[axises[1]];
+ const int input_width = input_shape[axises[2]];
+ const int input_depth = input_shape[axises[3]];
+
+ const int output_batch = output_shape[axises[0]];
+ const int output_height = output_shape[axises[1]];
+ const int output_width = output_shape[axises[2]];
+ const int output_depth = output_shape[axises[3]];
+
+ assert(input_batch == output_batch);
+ assert(input_height == output_height * block_size);
+ assert(input_width == output_width * block_size);
+ assert(input_depth * block_size * block_size == output_depth);
+
+ for (int in_b = 0; in_b < input_batch; ++in_b)
+ {
+ for (int in_h = 0; in_h < input_height; ++in_h)
+ {
+ for (int in_w = 0; in_w < input_width; ++in_w)
+ {
+ for (int in_d = 0; in_d < input_depth; ++in_d)
+ {
+ const int out_b = in_b;
+ const int out_h = in_h / block_size;
+ const int out_w = in_w / block_size;
+ const int out_d =
+ in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth;
+
+ const int input_index = Offset4D(input_shape, in_b, in_h, in_w, in_d, axises);
+ const int output_index = Offset4D(output_shape, out_b, out_h, out_w, out_d, axises);
+
+ output_data[output_index] = input_data[input_index];
+ }
+ }
+ }
+ }
+}
+
+void SimpleSpaceToDepth::run()
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_input)->map(q);
+ CAST_CL(_output)->map(q);
+ }
+
+ auto input_buf = _input->buffer();
+ auto output_buf = _output->buffer();
+ switch (_input->info()->data_type())
+ {
+ case ::arm_compute::DataType::U8:
+ case ::arm_compute::DataType::QASYMM8:
+ SpaceToDepth(reinterpret_cast<const uint8_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<uint8_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::S8:
+ SpaceToDepth(reinterpret_cast<const int8_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<int8_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::U32:
+ SpaceToDepth(reinterpret_cast<const uint32_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<uint32_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::S32:
+ SpaceToDepth(reinterpret_cast<const int32_t *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<int32_t *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ case ::arm_compute::DataType::F32:
+ SpaceToDepth(reinterpret_cast<const float *>(input_buf), _input->info()->tensor_shape(),
+ _block_size, reinterpret_cast<float *>(output_buf),
+ _output->info()->tensor_shape(), _axises);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported");
+ break;
+ }
+
+ if (::internal::arm_compute::isGpuMode())
+ {
+ auto &q = ::arm_compute::CLScheduler::get().queue();
+
+ CAST_CL(_input)->unmap(q);
+ CAST_CL(_output)->unmap(q);
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h
new file mode 100644
index 000000000..f5e028b1c
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SIMPLE_SPACE_TO_DEPTH_H__
+#define __SIMPLE_SPACE_TO_DEPTH_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleSpaceToDepth : public ::arm_compute::IFunction
+{
+public:
+ /** Initialise input and output
+ *
+ * @param[in] input First tensor input.
+ * @param[out] output Output tensor.
+   * @param[in]  block_size  Block size.
+   * @param[in]  axises      Mapping from the logical (batch, height, width, depth) order to tensor dimension indices.
+   */
+ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size,
+ const ::arm_compute::Coordinates &axises);
+
+ void run() override;
+
+private:
+ ::arm_compute::ITensor *_input;
+ ::arm_compute::ITensor *_output;
+ int32_t _block_size;
+ ::arm_compute::Coordinates _axises;
+};
+
+#endif /*__SIMPLE_SPACE_TO_DEPTH_H__ */
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc
new file mode 100644
index 000000000..3f988a819
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc
@@ -0,0 +1,40 @@
+#include "SquaredDifferenceOperation.h"
+#include "internal/arm_compute.h"
+
+void SquaredDifferenceOperation::configure(::arm_compute::ITensor *input1,
+ ::arm_compute::ITensor *input2,
+ ::arm_compute::ITensor *output,
+ ::arm_compute::ConvertPolicy ConvertPolicy, float scale,
+ ::arm_compute::RoundingPolicy RoundingPolicy)
+{
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
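+ // SquaredDifference is computed in two steps: out = input1 - input2, then out = out * out (element-wise, in place).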
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_sub.configure(CAST_CL(input1), CAST_CL(input2), CAST_CL(output), ConvertPolicy);
+ _cl_mul.configure(CAST_CL(output), CAST_CL(output), CAST_CL(output), scale, ConvertPolicy,
+ RoundingPolicy);
+ }
+ else
+ {
+ _neon_sub.configure(CAST_NE(input1), CAST_NE(input2), CAST_NE(output), ConvertPolicy);
+ _neon_mul.configure(CAST_NE(output), CAST_NE(output), CAST_NE(output), scale, ConvertPolicy,
+ RoundingPolicy);
+ }
+}
+
+void SquaredDifferenceOperation::run(void)
+{
+ if (::internal::arm_compute::isGpuMode())
+ {
+ _cl_sub.run();
+ _cl_mul.run();
+ }
+ else
+ {
+ _neon_sub.run();
+ _neon_mul.run();
+ }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h
new file mode 100644
index 000000000..3782c4e8c
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h
@@ -0,0 +1,35 @@
+#ifndef __SQUARED_DIFFERENCE_OPERATION_H__
+#define __SQUARED_DIFFERENCE_OPERATION_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h>
+#include <arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h>
+#include <arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h>
+#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h>
+
+class SquaredDifferenceOperation : public ::arm_compute::IFunction
+{
+public:
+ void configure(::arm_compute::ITensor *input1, ::arm_compute::ITensor *input2,
+ ::arm_compute::ITensor *output, ::arm_compute::ConvertPolicy ConvertPolicy,
+ float scale, ::arm_compute::RoundingPolicy RoundingPolicy);
+
+public:
+ void run(void) override;
+
+private:
+ ::arm_compute::ITensor *_input1;
+ ::arm_compute::ITensor *_input2;
+
+ ::arm_compute::ITensor *_output;
+
+private:
+ ::arm_compute::CLArithmeticSubtraction _cl_sub;
+ ::arm_compute::CLPixelWiseMultiplication _cl_mul;
+
+ ::arm_compute::NEArithmeticSubtraction _neon_sub;
+ ::arm_compute::NEPixelWiseMultiplication _neon_mul;
+};
+#endif // __SQUARED_DIFFERENCE_OPERATION_H__