diff options
Diffstat (limited to 'runtimes/pure_arm_compute/src/internal/layers')
15 files changed, 1081 insertions, 0 deletions
diff --git a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h new file mode 100644 index 000000000..502a1ee0e --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FEATURE_LOGGING_LAYER_H__ +#define __FEATURE_LOGGING_LAYER_H__ + +#include <arm_compute/core/ITensor.h> +#include <arm_compute/runtime/IFunction.h> +#include <arm_compute/runtime/CL/CLScheduler.h> + +#include <iostream> +#include <iomanip> +#include <limits> + +#include "internal/arm_compute.h" + +class FeatureLoggingLayer : public ::arm_compute::IFunction +{ +public: + void configure(const std::string &tag, ::arm_compute::ITensor *target) + { + _tag = tag; + _target = target; + } + +public: + void run(void) override + { + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_target)->map(q); + } + + const size_t W = _target->info()->dimension(0); + const size_t H = _target->info()->dimension(1); + const size_t C = _target->info()->dimension(2); + + std::cout << _tag << std::endl; + + for (size_t ch = 0; ch < C; ++ch) + { + std::cout << "Channel #" << ch << std::endl; + for (size_t row = 0; row < H; ++row) + { + for (size_t col = 0; col < W; ++col) + { + const 
arm_compute::Coordinates id{col, row, ch}; + const auto value = *reinterpret_cast<float *>(_target->ptr_to_element(id)); + + // TODO Generalize this to integer types + std::cout << std::setprecision(2); + std::cout << std::setw(7); + std::cout << std::setfill(' '); + std::cout << std::fixed; + std::cout << value << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_target)->unmap(q); + } + } + +private: + std::string _tag; + ::arm_compute::ITensor *_target; +}; + +#endif // __FEATURE_LOGGING_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc new file mode 100644 index 000000000..311284efc --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "GenericFullyConnectedLayer.h" +#include "internal/arm_compute.h" + +#include <arm_compute/core/Helpers.h> + +void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input, + ::arm_compute::ITensor *weights, + ::arm_compute::ITensor *biases, + ::arm_compute::ITensor *output, bool needs_reshape, + ::arm_compute::TensorShape reshape) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + // TODO Too many duplicated code. Revise below code. + if (::internal::arm_compute::isGpuMode()) + { + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); + _generic_reshape.configure(CAST_CL(_input), &_cl_buffer); + + _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output)); + + // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_buffer.allocator()->allocate(); + } + else + { + _cl_fc.configure(CAST_CL(_input), CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output)); + } + } + else + { + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); + _generic_reshape.configure(CAST_NE(_input), &_neon_buffer); + + _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output)); + + // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate + // here. 
+ _neon_buffer.allocator()->allocate(); + } + else + { + _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output)); + } + } +} + +void GenericFullyConnectedLayer::run(void) +{ + if (::internal::arm_compute::isGpuMode()) + { + if (_needs_reshape) + _generic_reshape.run(); + + _cl_fc.run(); + } + else + { + if (_needs_reshape) + _generic_reshape.run(); + + _neon_fc.run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h new file mode 100644 index 000000000..55d8683da --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__ +#define __GENERIC_FULLY_CONNECTED_LAYER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> +#include "internal/layers/GenericReshapeLayer.h" + +class GenericFullyConnectedLayer : public ::arm_compute::IFunction +{ +public: + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, + ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape, + ::arm_compute::TensorShape reshape); + +public: + void run(void) override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_weights; + ::arm_compute::ITensor *_biases; + ::arm_compute::ITensor *_output; + + // buffer for reshaping input tensor + ::arm_compute::CLTensor _cl_buffer; + ::arm_compute::Tensor _neon_buffer; + +private: + ::arm_compute::CLFullyConnectedLayer _cl_fc; + ::arm_compute::NEFullyConnectedLayer _neon_fc; + GenericReshapeLayer _generic_reshape; + bool _needs_reshape; +}; + +#endif // __GENERIC_FULLY_CONNECTED_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc new file mode 100644 index 000000000..2cdfe1b6e --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GenericReshapeLayer.h" +#include "internal/arm_compute.h" + +void GenericReshapeLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output) +{ + _input = input; + _output = output; + + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape W / H / C into another tensor of shape C / W / H + // + // Original | Permuted + // 0 | W | C (from 2) + // 1 | H | W (from 0) + // 2 | C | H (from 1) + // + const ::arm_compute::PermutationVector pv{2, 0, 1}; + + if (::internal::arm_compute::isGpuMode()) + { + _cl_permute.configure(CAST_CL(input), &_cl_permuted, pv); + _cl_reshape.configure(&_cl_permuted, CAST_CL(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_permuted.allocator()->allocate(); + } + else + { + _neon_permute.configure(CAST_NE(input), &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, CAST_NE(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. 
+ _neon_permuted.allocator()->allocate(); + } +} + +void GenericReshapeLayer::run(void) +{ + if (::internal::arm_compute::isGpuMode()) + { + _cl_permute.run(); + _cl_reshape.run(); + } + else + { + _neon_permute.run(); + _neon_reshape.run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h new file mode 100644 index 000000000..1def21085 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __GENERIC_RESHAPE_LAYER_H__ +#define __GENERIC_RESHAPE_LAYER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLPermute.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/NEON/functions/NEPermute.h> +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> + +class GenericReshapeLayer : public ::arm_compute::IFunction +{ +public: + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); + +public: + void run(void) override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + ::arm_compute::CLTensor _cl_permuted; + ::arm_compute::Tensor _neon_permuted; + +private: + ::arm_compute::CLPermute _cl_permute; + ::arm_compute::CLReshapeLayer _cl_reshape; + + ::arm_compute::NEPermute _neon_permute; + ::arm_compute::NEReshapeLayer _neon_reshape; +}; + +#endif // __GENERIC_RESHAPE_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc new file mode 100644 index 000000000..4a5370587 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc @@ -0,0 +1,78 @@ +/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include "PadLayer.h"
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
+ unsigned int border_width)
+{
+ _input = input;
+ _output = output;
+ _border_width = border_width;
+ _output_height = _output->info()->dimension(0);
+ _output_width = _output->info()->dimension(1);
+
+ uint8_t constant_border_value = 0;
+ ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);
+
+ unsigned int padding_size = _border_width;
+ input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});
+ _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,
+ constant_pixel_value);
+}
+
+void PadLayer::run(void)
+{
+ _fillborderkernel.run();
+
+ ::arm_compute::Coordinates coordinates =
+ ::arm_compute::Coordinates(-_border_width, -_border_width);
+ ::arm_compute::TensorShape new_tensor_shape =
+ ::arm_compute::TensorShape(_output_height, _output_width);
+
+ /* NOTE: The cl kernel fills the data in the borders(not in the tensor).
+ Once the tensor is received back at NNAPI, we are adjusting
+ the valid region in such a way that the padding becomes part of the tensor itself
+ and matches the size of output. */
+ _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
+
+ /* NOTE: Since cl kernel does not have an argument for output tensor while NNAPI does.
+ We need to map the input (tensor that is passed to the cl kernel) back to
+ output. */
+
+ // TODO: Write a modified CLCopy kernel to do this job.
+ populateOutput();
+}
+
+void PadLayer::populateOutput()
+{
+ auto &queue = ::arm_compute::CLScheduler::get().queue();
+ _input->map(queue);
+ _output->map(queue);
+
+ auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
+ auto const source_data = input_tensor->buffer();
+
+ auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
+ auto dst_data = output_tensor->buffer();
+
+ memmove(dst_data, source_data, _output_height * _output_width * 4);
+
+ _input->unmap(queue);
+ _output->unmap(queue);
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h new file mode 100644 index 000000000..cb3f36337 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h @@ -0,0 +1,41 @@ +/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PAD_LAYER_H__
+#define __PAD_LAYER_H__
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
+
// Pads an OpenCL tensor with a constant (zero) border and copies the padded
// result into a separate output tensor (see PadLayer.cc for the mechanics).
class PadLayer : public ::arm_compute::IFunction
{
public:
  // Records input/output tensors and the border width, extends the input's
  // padding, and configures the CLFillBorder kernel.
  void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
                 unsigned int border_width);
  // Fills the border, widens the input's valid region, then copies the padded
  // input into the output tensor.
  void run(void) override;

private:
  ::arm_compute::ICLTensor *_input;  // tensor to pad (not owned)
  ::arm_compute::ICLTensor *_output; // destination tensor (not owned)
  // Kept signed on purpose: run() negates it to build the border-origin
  // coordinates, which would wrap around if this were unsigned.
  int _border_width;
  // NOTE(review): assigned from output dimension(0)/dimension(1) in configure();
  // in ACL dimension(0) is conventionally the width, so these names may be
  // swapped — confirm. They are only used together (shape / byte-count).
  int _output_height;
  int _output_width;

  ::arm_compute::CLFillBorder _fillborderkernel;
  // Host-side copy of the padded input into _output (see run()).
  void populateOutput();
};
+
+#endif // __PAD_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h new file mode 100644 index 000000000..31c927b4f --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_ARITHMETIC_ADDITION_H__ +#define __SIMPLE_ARITHMETIC_ADDITION_H__ + +#include "internal/arm_compute.h" +#include <arm_compute/core/ITensor.h> + +class SimpleArithmeticAddition : public ::arm_compute::IFunction +{ +public: + void configure(::arm_compute::ITensor *lhs, ::arm_compute::ITensor *rhs, + ::arm_compute::ITensor *out) + { + _lhs = lhs; + _rhs = rhs; + _out = out; + } + +public: + void run(void) override + { + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_lhs)->map(q); + CAST_CL(_rhs)->map(q); + CAST_CL(_out)->map(q); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_out->info()->tensor_shape()); + + execute_window_loop(window, [this](const arm_compute::Coordinates &id) { + // NOTE Must be two input tensors of identical type + // Must be output tensor of the same type as input0. 
+ assert(_lhs->info()->data_type() == _rhs->info()->data_type()); + assert(_lhs->info()->data_type() == _out->info()->data_type()); + + switch (_lhs->info()->data_type()) + { + case ::arm_compute::DataType::F32: + { + const auto lhs_value = *reinterpret_cast<float *>(_lhs->ptr_to_element(id)); + const auto rhs_value = *reinterpret_cast<float *>(_rhs->ptr_to_element(id)); + *reinterpret_cast<float *>(_out->ptr_to_element(id)) = lhs_value + rhs_value; + break; + } + case ::arm_compute::DataType::S32: + { + const auto lhs_value = *reinterpret_cast<int32_t *>(_lhs->ptr_to_element(id)); + const auto rhs_value = *reinterpret_cast<int32_t *>(_rhs->ptr_to_element(id)); + *reinterpret_cast<int32_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value; + break; + } + case ::arm_compute::DataType::U32: + { + const auto lhs_value = *reinterpret_cast<uint32_t *>(_lhs->ptr_to_element(id)); + const auto rhs_value = *reinterpret_cast<uint32_t *>(_rhs->ptr_to_element(id)); + *reinterpret_cast<uint32_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value; + break; + } + case ::arm_compute::DataType::QASYMM8: + { + const auto lhs_value = *reinterpret_cast<uint8_t *>(_lhs->ptr_to_element(id)); + const auto rhs_value = *reinterpret_cast<uint8_t *>(_rhs->ptr_to_element(id)); + // How to handle with overflow? 
+ *reinterpret_cast<uint8_t *>(_out->ptr_to_element(id)) = lhs_value + rhs_value; + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } + }); + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_out)->unmap(q); + CAST_CL(_rhs)->unmap(q); + CAST_CL(_lhs)->unmap(q); + } + } + +private: + ::arm_compute::ITensor *_lhs; + ::arm_compute::ITensor *_rhs; + ::arm_compute::ITensor *_out; +}; + +#endif // __SIMPLE_ARITHMETIC_ADDITION_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h new file mode 100644 index 000000000..fa3006438 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SIMPLE_CAST_LAYER_H__ +#define __SIMPLE_CAST_LAYER_H__ + +#include <arm_compute/core/ITensor.h> + +#include "internal/arm_compute.h" +#include "internal/op/Cast.h" + +class SimpleCastLayer : public ::arm_compute::IFunction +{ +public: + void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out) + { + _in = in; + _out = out; + } + +public: + void run(void) override + { + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_in)->map(q); + CAST_CL(_out)->map(q); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_out->info()->tensor_shape()); + + execute_window_loop(window, + [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); }); + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_out)->unmap(q); + CAST_CL(_in)->unmap(q); + } + } + + void castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out, + const arm_compute::Coordinates &id) + { + switch (in->info()->data_type()) + { + case ::arm_compute::DataType::F32: + { + copyCast(*reinterpret_cast<float *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::S32: + { + copyCast(*reinterpret_cast<int32_t *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::U32: + { + copyCast(*reinterpret_cast<uint32_t *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::QASYMM8: + { + const uint8_t quantizedValue = *(in->ptr_to_element(id)); + copyCast(in->info()->quantization_info().dequantize(quantizedValue), out, id); + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } + } + +private: + ::arm_compute::ITensor *_in; + ::arm_compute::ITensor *_out; +}; + +#endif // __SIMPLE_CAST_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc 
b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc new file mode 100644 index 000000000..089c783c1 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc @@ -0,0 +1,115 @@ +#include "internal/layers/SimpleEmbeddingLookup.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups, + ::arm_compute::ITensor *values, + ::arm_compute::ITensor *output) +{ + // Assume that verification of operands are already done at Planner::visit() + _lookups = lookups; + _values = values; + _output = output; +} + +void SimpleEmbeddingLookup::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_lookups)->map(q); + CAST_CL(_values)->map(q); + CAST_CL(_output)->map(q); + } + + // type of elements of lookups is always integer + const int32_t *lookups_buf = reinterpret_cast<int32_t *>(_lookups->buffer()); + const auto values_buf = _values->buffer(); + auto output_buf = _output->buffer(); + + const auto lookups_info = _lookups->info(); + const auto values_info = _values->info(); + const auto output_info = _output->info(); + + // TODO Refactor below duplicated code! 
+ const auto values_rank = values_info->num_dimensions(); + switch (values_rank) + { + case 2: + // (H,W) in nnapi -> (W,H) in acl + { + const size_t row_size = values_info->dimension(1); + const size_t row_bytes = values_info->total_size() / row_size; + for (size_t i = 0; i < lookups_info->dimension(0); ++i) + { + if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) + throw std::runtime_error("Embedding Lookup: index out of bounds."); + + size_t idx = lookups_buf[i]; + size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx}); + size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i}); + + unsigned char *sink_addr = output_buf + row_offset_by_i; + unsigned char *source_addr = values_buf + row_offset_by_idx; + memcpy(sink_addr, source_addr, row_bytes); + } + } + break; + case 3: + // (B,H,W) in nnapi -> (W,H,B) in acl + { + const size_t row_size = values_info->dimension(2); + const size_t row_bytes = values_info->total_size() / row_size; + for (size_t i = 0; i < lookups_info->dimension(0); ++i) + { + if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) + throw std::runtime_error("Embedding Lookup: index out of bounds."); + + size_t idx = lookups_buf[i]; + size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx}); + size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i}); + + unsigned char *sink_addr = output_buf + row_offset_by_i; + unsigned char *source_addr = values_buf + row_offset_by_idx; + memcpy(sink_addr, source_addr, row_bytes); + } + } + break; + case 4: + // (N,H,W,C) in nnapi -> (N,C,H,W) in acl + { + const size_t row_size = values_info->dimension(3); + const size_t row_bytes = values_info->total_size() / row_size; + for (size_t i = 0; i < lookups_info->dimension(0); ++i) + { + if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) + throw std::runtime_error("Embedding Lookup: index out of bounds."); + + size_t idx = lookups_buf[i]; + size_t row_offset_by_idx = 
values_info->offset_element_in_bytes({0, 0, 0, idx}); + size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i}); + + unsigned char *sink_addr = output_buf + row_offset_by_i; + unsigned char *source_addr = values_buf + row_offset_by_idx; + memcpy(sink_addr, source_addr, row_bytes); + } + } + break; + case 1: + // In this case, shape of values actually is matrix but the height(row size) is 1 in acl. If + // row size is 1, this op is not needed and it means this situtation could be wrong. + throw std::runtime_error("Wrong usage of EmbeddingLookup op!"); + default: + throw std::runtime_error("Not supported rank!"); + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_lookups)->unmap(q); + CAST_CL(_values)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h new file mode 100644 index 000000000..9f2cd977f --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h @@ -0,0 +1,22 @@ +#ifndef __SIMPLE_EMBEDDING_LOOKUP_H__ +#define __SIMPLE_EMBEDDING_LOOKUP_H__ + +#include "internal/arm_compute.h" +#include <arm_compute/core/ITensor.h> +#include <arm_compute/runtime/IFunction.h> + +class SimpleEmbeddingLookup : public ::arm_compute::IFunction +{ +public: + void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values, + ::arm_compute::ITensor *output); + + void run() override; + +private: + ::arm_compute::ITensor *_lookups; + ::arm_compute::ITensor *_values; + ::arm_compute::ITensor *_output; +}; + +#endif /*__SIMPLE_EMBEDDING_LOOKUP_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc new file mode 100644 index 000000000..682295f81 --- /dev/null +++ 
b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleSpaceToDepth.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + int32_t block_size, + const ::arm_compute::Coordinates &axises = {3, 1, 0, 2}) +{ + assert(input->info()->num_dimensions() == 4); + assert(output->info()->num_dimensions() == 4); + const auto rank = axises.num_dimensions(); + assert(rank == 4); + for (int i = 0; i < rank; ++i) + { + assert(axises[i] >= 0); + assert(axises[i] < rank); + } + + _input = input; + _output = output; + _block_size = block_size; + _axises = axises; +} + +inline int32_t Offset4D(const ::arm_compute::TensorShape &shape, int32_t b, int32_t h, int32_t w, + int32_t d, const ::arm_compute::Coordinates &axises) +{ + // b, h, w, d >= 0 + size_t indexes[4]; + indexes[axises[0]] = b; + indexes[axises[1]] = h; + indexes[axises[2]] = w; + indexes[axises[3]] = d; + + int32_t offset = indexes[3] * shape[2] * shape[1] * shape[0]; + offset += indexes[2] * shape[1] * shape[0]; + offset += indexes[1] * shape[0]; + offset += indexes[0]; + return offset; +} + +template <typename T> +inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape &input_shape, + int32_t 
block_size, T *output_data, + const ::arm_compute::TensorShape &output_shape, + const ::arm_compute::Coordinates &axises) +{ + const int input_batch = input_shape[axises[0]]; + const int input_height = input_shape[axises[1]]; + const int input_width = input_shape[axises[2]]; + const int input_depth = input_shape[axises[3]]; + + const int output_batch = output_shape[axises[0]]; + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int output_depth = output_shape[axises[3]]; + + assert(input_batch == output_batch); + assert(input_height == output_height * block_size); + assert(input_width == output_width * block_size); + assert(input_depth * block_size * block_size == output_depth); + + for (int in_b = 0; in_b < input_batch; ++in_b) + { + for (int in_h = 0; in_h < input_height; ++in_h) + { + for (int in_w = 0; in_w < input_width; ++in_w) + { + for (int in_d = 0; in_d < input_depth; ++in_d) + { + const int out_b = in_b; + const int out_h = in_h / block_size; + const int out_w = in_w / block_size; + const int out_d = + in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth; + + const int input_index = Offset4D(input_shape, in_b, in_h, in_w, in_d, axises); + const int output_index = Offset4D(output_shape, out_b, out_h, out_w, out_d, axises); + + output_data[output_index] = input_data[input_index]; + } + } + } + } +} + +void SimpleSpaceToDepth::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + } + + auto input_buf = _input->buffer(); + auto output_buf = _output->buffer(); + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + SpaceToDepth(reinterpret_cast<const uint8_t *>(input_buf), _input->info()->tensor_shape(), + _block_size, reinterpret_cast<uint8_t *>(output_buf), + _output->info()->tensor_shape(), 
_axises); + break; + case ::arm_compute::DataType::S8: + SpaceToDepth(reinterpret_cast<const int8_t *>(input_buf), _input->info()->tensor_shape(), + _block_size, reinterpret_cast<int8_t *>(output_buf), + _output->info()->tensor_shape(), _axises); + break; + case ::arm_compute::DataType::U32: + SpaceToDepth(reinterpret_cast<const uint32_t *>(input_buf), _input->info()->tensor_shape(), + _block_size, reinterpret_cast<uint32_t *>(output_buf), + _output->info()->tensor_shape(), _axises); + break; + case ::arm_compute::DataType::S32: + SpaceToDepth(reinterpret_cast<const int32_t *>(input_buf), _input->info()->tensor_shape(), + _block_size, reinterpret_cast<int32_t *>(output_buf), + _output->info()->tensor_shape(), _axises); + break; + case ::arm_compute::DataType::F32: + SpaceToDepth(reinterpret_cast<const float *>(input_buf), _input->info()->tensor_shape(), + _block_size, reinterpret_cast<float *>(output_buf), + _output->info()->tensor_shape(), _axises); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h new file mode 100644 index 000000000..f5e028b1c --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_SPACE_TO_DEPTH_H__ +#define __SIMPLE_SPACE_TO_DEPTH_H__ + +#include "internal/arm_compute.h" +#include <arm_compute/core/ITensor.h> +#include <arm_compute/runtime/IFunction.h> + +class SimpleSpaceToDepth : public ::arm_compute::IFunction +{ +public: + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. + * @param[in] block_size Block size. + */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size, + const ::arm_compute::Coordinates &axises); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + int32_t _block_size; + ::arm_compute::Coordinates _axises; +}; + +#endif /*__SIMPLE_SPACE_TO_DEPTH_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc new file mode 100644 index 000000000..3f988a819 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc @@ -0,0 +1,40 @@ +#include "SquaredDifferenceOperation.h" +#include "internal/arm_compute.h" + +void SquaredDifferenceOperation::configure(::arm_compute::ITensor *input1, + ::arm_compute::ITensor *input2, + ::arm_compute::ITensor *output, + ::arm_compute::ConvertPolicy ConvertPolicy, float scale, + ::arm_compute::RoundingPolicy RoundingPolicy) +{ + _input1 = input1; + _input2 = input2; + _output = output; + + if (::internal::arm_compute::isGpuMode()) + { + 
_cl_sub.configure(CAST_CL(input1), CAST_CL(input2), CAST_CL(output), ConvertPolicy); + _cl_mul.configure(CAST_CL(output), CAST_CL(output), CAST_CL(output), scale, ConvertPolicy, + RoundingPolicy); + } + else + { + _neon_sub.configure(CAST_NE(input1), CAST_NE(input2), CAST_NE(output), ConvertPolicy); + _neon_mul.configure(CAST_NE(output), CAST_NE(output), CAST_NE(output), scale, ConvertPolicy, + RoundingPolicy); + } +} + +void SquaredDifferenceOperation::run(void) +{ + if (::internal::arm_compute::isGpuMode()) + { + _cl_sub.run(); + _cl_mul.run(); + } + else + { + _neon_sub.run(); + _neon_mul.run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h new file mode 100644 index 000000000..3782c4e8c --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h @@ -0,0 +1,35 @@ +#ifndef __SQUARED_DIFFERENCE_OPERATION_H__ +#define __SQUARED_DIFFERENCE_OPERATION_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h> +#include <arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h> +#include <arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h> +#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h> + +class SquaredDifferenceOperation : public ::arm_compute::IFunction +{ +public: + void configure(::arm_compute::ITensor *input1, ::arm_compute::ITensor *input2, + ::arm_compute::ITensor *output, ::arm_compute::ConvertPolicy ConvertPolicy, + float scale, ::arm_compute::RoundingPolicy RoundingPolicy); + +public: + void run(void) override; + +private: + ::arm_compute::ITensor *_input1; + ::arm_compute::ITensor *_input2; + + ::arm_compute::ITensor *_output; + +private: + ::arm_compute::CLArithmeticSubtraction _cl_sub; + ::arm_compute::CLPixelWiseMultiplication _cl_mul; + + 
::arm_compute::NEArithmeticSubtraction _neon_sub; + ::arm_compute::NEPixelWiseMultiplication _neon_mul; +}; +#endif // __SQUARED_DIFFERENCE_OPERATION_H__ |