diff options
Diffstat (limited to 'runtimes/pure_arm_compute/src/internal/layers')
37 files changed, 2194 insertions, 403 deletions
diff --git a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h index 502a1ee0e..83ae7c17b 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/FeatureLoggingLayer.h @@ -14,6 +14,12 @@ * limitations under the License. */ +/** + * @file FeatureLoggingLayer.h + * @brief This file contains FeatureLoggingLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __FEATURE_LOGGING_LAYER_H__ #define __FEATURE_LOGGING_LAYER_H__ @@ -27,9 +33,24 @@ #include "internal/arm_compute.h" +/** + * @brief Class to run FeatureLogging Layer + */ class FeatureLoggingLayer : public ::arm_compute::IFunction { public: + FeatureLoggingLayer(void) : _tag(""), _target(nullptr) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] tag Text tag for this layer + * @param[in] target The feature tensor to be printed + * @return N/A + */ void configure(const std::string &tag, ::arm_compute::ITensor *target) { _tag = tag; @@ -37,6 +58,10 @@ public: } public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ void run(void) override { if (::internal::arm_compute::isGpuMode()) diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc index 311284efc..28789a801 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc @@ -17,8 +17,6 @@ #include "GenericFullyConnectedLayer.h" #include "internal/arm_compute.h" -#include <arm_compute/core/Helpers.h> - void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, ::arm_compute::ITensor *biases, @@ -56,9 +54,9 @@ void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input, { // reshape auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); - _generic_reshape.configure(CAST_NE(_input), &_neon_buffer); + _generic_reshape.configure(_input, &_neon_buffer); - _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output)); + _neon_fc.configure(&_neon_buffer, _weights, _biases, _output); // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate // here. @@ -66,7 +64,7 @@ void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input, } else { - _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output)); + _neon_fc.configure(_input, _weights, _biases, _output); } } } diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h index 55d8683da..f1519f54d 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h @@ -14,23 +14,52 @@ * limitations under the License. 
*/ +/** + * @file GenericFullyConnectedLayer.h + * @brief This file contains GenericFullyConnectedLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__ #define __GENERIC_FULLY_CONNECTED_LAYER_H__ -#include <arm_compute/runtime/Tensor.h> -#include <arm_compute/runtime/CL/CLTensor.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> #include "internal/layers/GenericReshapeLayer.h" +/** + * @brief Class to run FullyConnected Layer with both CPU and GPU + */ class GenericFullyConnectedLayer : public ::arm_compute::IFunction { public: + GenericFullyConnectedLayer(void) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _neon_buffer{}, _cl_fc{}, _neon_fc{}, _generic_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @return N/A + */ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape, ::arm_compute::TensorShape reshape); public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ void run(void) override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc index 2cdfe1b6e..c38c2e9e3 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.cc @@ -43,8 +43,8 @@ void GenericReshapeLayer::configure(::arm_compute::ITensor *input, ::arm_compute } else { - _neon_permute.configure(CAST_NE(input), &_neon_permuted, pv); - _neon_reshape.configure(&_neon_permuted, CAST_NE(output)); + _neon_permute.configure(input, &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, output); // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. _neon_permuted.allocator()->allocate(); diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h index 1def21085..a22c14c8b 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/GenericReshapeLayer.h @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +/** + * @file GenericReshapeLayer.h + * @brief This file contains GenericReshapeLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __GENERIC_RESHAPE_LAYER_H__ #define __GENERIC_RESHAPE_LAYER_H__ @@ -25,12 +31,33 @@ #include <arm_compute/runtime/NEON/functions/NEPermute.h> #include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> +/** + * @brief Class to run Reshape Layer with both CPU and GPU + */ class GenericReshapeLayer : public ::arm_compute::IFunction { public: + GenericReshapeLayer(void) + : _input(nullptr), _output(nullptr), _cl_permuted{}, _neon_permuted{}, _cl_permute{}, + _cl_reshape{}, _neon_permute{}, _neon_reshape{} + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] output The destination tensor + * @return N/A + */ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ void run(void) override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc deleted file mode 100644 index 4a5370587..000000000 --- a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.cc +++ /dev/null @@ -1,78 +0,0 @@ -/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <iostream>
-#include "PadLayer.h"
-#include <arm_compute/runtime/CL/CLScheduler.h>
-
-void PadLayer::configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
- unsigned int border_width)
-{
- _input = input;
- _output = output;
- _border_width = border_width;
- _output_height = _output->info()->dimension(0);
- _output_width = _output->info()->dimension(1);
-
- uint8_t constant_border_value = 0;
- ::arm_compute::PixelValue constant_pixel_value = ::arm_compute::PixelValue(constant_border_value);
-
- unsigned int padding_size = _border_width;
- input->info()->extend_padding(::arm_compute::PaddingSize{padding_size});
- _fillborderkernel.configure(input, _border_width, ::arm_compute::BorderMode::CONSTANT,
- constant_pixel_value);
-}
-
-void PadLayer::run(void)
-{
- _fillborderkernel.run();
-
- ::arm_compute::Coordinates coordinates =
- ::arm_compute::Coordinates(-_border_width, -_border_width);
- ::arm_compute::TensorShape new_tensor_shape =
- ::arm_compute::TensorShape(_output_height, _output_width);
-
- /* NOTE: The cl kernel fills the data in the borders(not in the tensor).
- Once the tensor is received back at NNAPI, we are adjusting
- the valid region in such a way that the padding becomes part of the tensor itself
- and matches the size of output. */
- _input->info()->set_valid_region(::arm_compute::ValidRegion(coordinates, new_tensor_shape));
-
- /* NOTE: Since cl kernel does not have an argument for output tensor while NNAPI does.
- We need to map the input (tensor that is passed to the cl kernel) back to
- output. */
-
- // TODO: Write a modified CLCopy kernel to do this job.
- populateOutput();
-}
-
-void PadLayer::populateOutput()
-{
- auto &queue = ::arm_compute::CLScheduler::get().queue();
- _input->map(queue);
- _output->map(queue);
-
- auto input_tensor = static_cast<::arm_compute::ITensor *>(_input);
- auto const source_data = input_tensor->buffer();
-
- auto output_tensor = static_cast<::arm_compute::ITensor *>(_output);
- auto dst_data = output_tensor->buffer();
-
- memmove(dst_data, source_data, _output_height * _output_width * 4);
-
- _input->unmap(queue);
- _output->unmap(queue);
-}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.cc new file mode 100644 index 000000000..6d348e814 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.cc @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleArgMinMax.h" +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleArgMinMax::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + std::vector<uint32_t> axis, ::arm_compute::ArgOperation op) +{ + _input = input; + _output = output; + _axis = axis; + _input_rank = input->info()->num_dimensions(); + _op_type = op; +} + +inline const ::arm_compute::TensorShape +inferOutputShape(const ::arm_compute::TensorShape &input_shape, const std::vector<uint32_t> &axis, + int input_rank) +{ + ::arm_compute::TensorShape out_shape{}; + size_t dim = 1; + for (int i = 0; i < input_rank; ++i) + { + dim = input_shape[i]; + out_shape.set(i, dim); + } + + for (int i = 0; i < axis.size(); ++i) + { + out_shape.set(axis[i], 1); + } + + return out_shape; +} + +template <typename T> +inline T getArgMinMaxEle(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::TensorShape &output_shape, const size_t b, + const size_t d, const size_t h, const 
size_t w, const int axis, + const ::arm_compute::ArgOperation op_type) +{ + // If output[dimention] == 1, will check all values of that dimension because of reducing + // dimension. + // Else will check only one value. + const size_t start_b = output_shape[3] == 1 ? 0 : b; + const size_t start_d = output_shape[2] == 1 ? 0 : d; + const size_t start_h = output_shape[1] == 1 ? 0 : h; + const size_t start_w = output_shape[0] == 1 ? 0 : w; + const size_t stop_b = output_shape[3] == 1 ? input_shape[3] - 1 : b; + const size_t stop_d = output_shape[2] == 1 ? input_shape[2] - 1 : d; + const size_t stop_h = output_shape[1] == 1 ? input_shape[1] - 1 : h; + const size_t stop_w = output_shape[0] == 1 ? input_shape[0] - 1 : w; + + ::arm_compute::Coordinates id{w, h, d, b}; + ::arm_compute::Coordinates min_max_id{w, h, d, b}; + + T value = *reinterpret_cast<T *>(input->ptr_to_element(id)); + T tval = *reinterpret_cast<T *>(input->ptr_to_element(id)); + + for (size_t in_b = start_b; in_b <= stop_b; ++in_b) + { + id.set(3, in_b); + for (size_t in_d = start_d; in_d <= stop_d; ++in_d) + { + id.set(2, in_d); + for (size_t in_h = start_h; in_h <= stop_h; ++in_h) + { + id.set(1, in_h); + for (size_t in_w = start_w; in_w <= stop_w; ++in_w) + { + id.set(0, in_w); + if (op_type == ::arm_compute::ArgOperation::MIN) + { + value = std::min<T>(value, *reinterpret_cast<T *>(input->ptr_to_element(id))); + } + else if (op_type == ::arm_compute::ArgOperation::MAX) + { + value = std::max<T>(value, *reinterpret_cast<T *>(input->ptr_to_element(id))); + } + else + throw std::runtime_error("This Arg operation is not supported, yet"); + + if (tval != value) + { + min_max_id = id; + tval = value; + } + } + } + } + } + + return min_max_id[axis]; +} + +template <typename T> +inline void +getArgMinMax(const ::arm_compute::ITensor *input, const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::TensorShape &output_shape, ::arm_compute::ITensor *output, + const int axis, const 
::arm_compute::ArgOperation op_type) +{ + ::arm_compute::Coordinates id; + for (size_t out_b = 0; out_b < output_shape[3]; ++out_b) + { + id.set(3, out_b); + for (size_t out_d = 0; out_d < output_shape[2]; ++out_d) + { + id.set(2, out_d); + for (size_t out_h = 0; out_h < output_shape[1]; ++out_h) + { + id.set(1, out_h); + for (size_t out_w = 0; out_w < output_shape[0]; ++out_w) + { + id.set(0, out_w); + *reinterpret_cast<int *>(output->ptr_to_element(id)) = getArgMinMaxEle<T>( + input, input_shape, output_shape, out_b, out_d, out_h, out_w, axis, op_type); + } + } + } + } +} + +void SimpleArgMinMax::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + } + + ::arm_compute::TensorShape input_shape = _input->info()->tensor_shape(); + + // Axis dimension is 1 and size is 1. + // TODO support axis size > 1. + int axis_val = _axis[0]; + ::arm_compute::TensorShape output_shape = inferOutputShape(input_shape, _axis, _input_rank); + + _output->info()->set_tensor_shape(output_shape); + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::QASYMM8: + getArgMinMax<uint8_t>(_input, input_shape, output_shape, _output, axis_val, _op_type); + break; + case ::arm_compute::DataType::S32: + getArgMinMax<int32_t>(_input, input_shape, output_shape, _output, axis_val, _op_type); + break; + case ::arm_compute::DataType::F32: + getArgMinMax<float>(_input, input_shape, output_shape, _output, axis_val, _op_type); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + _output->info()->set_tensor_shape(output_shape); + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.h new 
file mode 100644 index 000000000..b90e74579 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArgMinMax.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_ARG_MIN_MAX_H__ +#define __SIMPLE_ARG_MIN_MAX_H__ + +#include "internal/arm_compute.h" +#include "arm_compute/core/TypesEx.h" + +class SimpleArgMinMax : public ::arm_compute::IFunction +{ +public: + SimpleArgMinMax(void) : _input(nullptr), _output(nullptr), _axis(), _input_rank(0) + { + // DO NOTHING + } + +public: + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. + * @param[in] axis Dimension along which to find Min or Max Index. 
+ */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + std::vector<uint32_t> axis, ::arm_compute::ArgOperation _op_type); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + std::vector<uint32_t> _axis; + int _input_rank; + ::arm_compute::ArgOperation _op_type; +}; + +#endif /*__SIMPLE_ARG_MIN_MAX_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h index 31c927b4f..aed9ae286 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleArithmeticAddition.h @@ -14,15 +14,36 @@ * limitations under the License. */ +/** + * @file SimpleArithmeticAddition.h + * @brief This file contains SimpleArithmeticAddition class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __SIMPLE_ARITHMETIC_ADDITION_H__ #define __SIMPLE_ARITHMETIC_ADDITION_H__ #include "internal/arm_compute.h" #include <arm_compute/core/ITensor.h> +/** + * @brief Class to run SimpleArithmeticAddition Layer + */ class SimpleArithmeticAddition : public ::arm_compute::IFunction { public: + SimpleArithmeticAddition(void) : _lhs(nullptr), _rhs(nullptr), _out(nullptr) + { + // DO NOTHING + } + + /** + * @brief Configure the layer + * @param[in] lhs Lefthand-side operand + * @param[in] rhs Righthand-side operand + * @param[in] out The destination tensor(Result operand) + * @return N/A + */ void configure(::arm_compute::ITensor *lhs, ::arm_compute::ITensor *rhs, ::arm_compute::ITensor *out) { @@ -32,6 +53,10 @@ public: } public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ void run(void) override { if (::internal::arm_compute::isGpuMode()) diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc new file mode 100644 index 000000000..87175ee1a --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.cc @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleBatchToSpaceNd.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleBatchToSpaceND::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + const int32_t *block_size, + const ::arm_compute::Coordinates &axises) +{ + const auto rank = axises.num_dimensions(); + assert(rank == 4); + + for (int i = 0; i < rank; ++i) + assert(axises[i] >= 0 && axises[i] < rank); + + _input = input; + _output = output; + _block_size = block_size; + _axises = axises; +} + +template <typename T> +inline void BatchToSpaceND(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, + const int32_t *block_size_data, ::arm_compute::ITensor *output, + const ::arm_compute::TensorShape &output_shape, + const ::arm_compute::Coordinates &axises) +{ + const int output_batch = output_shape[axises[0]]; + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int depth = output_shape[axises[3]]; + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + for (int out_w = 0; out_w < output_width; ++out_w) + { + for (int out_d = 0; out_d < depth; ++out_d) + { + const int in_d = out_d; + const int in_h = out_h / block_size_data[0]; + const int in_w = out_w / block_size_data[1]; + const int in_b = + out_b + + ((out_h % block_size_data[0]) * block_size_data[1] + out_w % block_size_data[1]) * + output_batch; + + auto input_id = + asARMComputeCoordinates(::arm_compute::Coordinates{in_b, in_h, in_w, in_d}, axises); + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); + + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); + } + } + } + } +} +void SimpleBatchToSpaceND::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = 
::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + BatchToSpaceND<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + case ::arm_compute::DataType::F32: + BatchToSpaceND<float>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h new file mode 100644 index 000000000..5695d9719 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleBatchToSpaceNd.h @@ -0,0 +1,51 @@ +/* + *Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SIMPLE_BATCH_TO_SPACE_ND_H__ +#define __SIMPLE_BATCH_TO_SPACE_ND_H__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimpleBatchToSpaceND : public ::arm_compute::IFunction +{ +public: + SimpleBatchToSpaceND(void) : _input(nullptr), _output(nullptr), _block_size(nullptr), _axises{} + { + // DO NOTHING + } + + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. + * @param[in] block_size Block size. + * @param[in] axises Axises of rank 4 + */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + const int32_t *block_size, + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + const int32_t *_block_size; + ::arm_compute::Coordinates _axises; +}; + +#endif /*__SIMPLE_BATCH_TO_SPACE_ND_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.cc new file mode 100644 index 000000000..7c7706a78 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.cc @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "internal/layers/SimpleCastLayer.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleCastLayer::castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out, + const arm_compute::Coordinates &id) +{ + switch (in->info()->data_type()) + { + case ::arm_compute::DataType::F32: + { + copyCast(*reinterpret_cast<float *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::S32: + { + copyCast(*reinterpret_cast<int32_t *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::U32: + { + copyCast(*reinterpret_cast<uint32_t *>(in->ptr_to_element(id)), out, id); + break; + } + case ::arm_compute::DataType::QASYMM8: + { + const uint8_t quantizedValue = *(in->ptr_to_element(id)); + copyCast(in->info()->quantization_info().dequantize(quantizedValue), out, id); + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } +} + +void SimpleCastLayer::configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out) +{ + _in = in; + _out = out; +} + +void SimpleCastLayer::run(void) +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_in)->map(q); + CAST_CL(_out)->map(q); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_out->info()->tensor_shape()); + + execute_window_loop(window, + [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); }); + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + CAST_CL(_out)->unmap(q); + CAST_CL(_in)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h index fa3006438..f9a48b481 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleCastLayer.h @@ -14,80 +14,55 @@ * limitations under the License. 
*/ +/** + * @file SimpleCastLayer.h + * @brief This file contains SimpleCastLayer class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __SIMPLE_CAST_LAYER_H__ #define __SIMPLE_CAST_LAYER_H__ -#include <arm_compute/core/ITensor.h> - #include "internal/arm_compute.h" -#include "internal/op/Cast.h" +#include "internal/arm_compute/Cast.h" +/** + * @brief Class to run SimpleCast Layer + */ class SimpleCastLayer : public ::arm_compute::IFunction { public: - void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out) + SimpleCastLayer(void) : _in(nullptr), _out(nullptr) { - _in = in; - _out = out; + // DO NOTHING } -public: - void run(void) override - { - if (::internal::arm_compute::isGpuMode()) - { - auto &q = ::arm_compute::CLScheduler::get().queue(); - CAST_CL(_in)->map(q); - CAST_CL(_out)->map(q); - } - - arm_compute::Window window; - window.use_tensor_dimensions(_out->info()->tensor_shape()); + /** + * @brief Configure the layer + * @param[in] in The source tensor + * @param[in] out The destination tensor + * @return N/A + */ + void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out); - execute_window_loop(window, - [this](const arm_compute::Coordinates &id) { castData(_in, _out, id); }); - - if (::internal::arm_compute::isGpuMode()) - { - auto &q = ::arm_compute::CLScheduler::get().queue(); - CAST_CL(_out)->unmap(q); - CAST_CL(_in)->unmap(q); - } - } + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ + void run(void) override; +private: + /** + * @brief Cast and copy data from one tensor to another + * + * @param[in] in The source tensor + * @param[out] out The destination tensor + * @param[in] id Coordinates to copy + * @return N/A + */ void castData(::arm_compute::ITensor *in, ::arm_compute::ITensor *out, - const arm_compute::Coordinates &id) - { - switch (in->info()->data_type()) - { - case ::arm_compute::DataType::F32: - { - copyCast(*reinterpret_cast<float *>(in->ptr_to_element(id)), out, id); - break; - } - case ::arm_compute::DataType::S32: - { - copyCast(*reinterpret_cast<int32_t *>(in->ptr_to_element(id)), out, id); - break; - } - case ::arm_compute::DataType::U32: - { - copyCast(*reinterpret_cast<uint32_t *>(in->ptr_to_element(id)), out, id); - break; - } - case ::arm_compute::DataType::QASYMM8: - { - const uint8_t quantizedValue = *(in->ptr_to_element(id)); - copyCast(in->info()->quantization_info().dequantize(quantizedValue), out, id); - break; - } - default: - throw std::runtime_error("Not supported, yet"); - break; - } - } + const arm_compute::Coordinates &id); -private: ::arm_compute::ITensor *_in; ::arm_compute::ITensor *_out; }; diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.cc new file mode 100644 index 000000000..d62a8321b --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.cc @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleDepthToSpace.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleDepthToSpace::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + int32_t block_size, const ::arm_compute::Coordinates &axises) +{ + const auto rank = axises.num_dimensions(); + assert(rank == 4); + for (int i = 0; i < rank; ++i) + { + assert(axises[i] >= 0); + assert(axises[i] < rank); + } + + _input = input; + _output = output; + _block_size = block_size; + _axises = axises; +} + +template <typename T> +inline void DepthToSpace(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, int32_t block_size, + ::arm_compute::ITensor *output, + const ::arm_compute::TensorShape &output_shape, + const ::arm_compute::Coordinates &axises) +{ + const int output_batch = output_shape[axises[0]]; + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int output_depth = output_shape[axises[3]]; + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + for (int out_w = 0; out_w < output_width; ++out_w) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + const int in_b = out_b; + const int in_h = out_h / block_size; + const int in_w = out_w / block_size; + const int in_d = + out_d + ((out_h % block_size) * block_size + out_w % block_size) * output_depth; + + auto input_id = + asARMComputeCoordinates(::arm_compute::Coordinates{in_b, in_h, in_w, in_d}, axises); + auto 
output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); + + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); + } + } + } + } +} + +void SimpleDepthToSpace::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + DepthToSpace<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + case ::arm_compute::DataType::F32: + DepthToSpace<float>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.h new file mode 100644 index 000000000..1032aaa47 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleDepthToSpace.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_DEPTH_TO_SPACE_H__ +#define __SIMPLE_DEPTH_TO_SPACE_H__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimpleDepthToSpace : public ::arm_compute::IFunction +{ +public: + SimpleDepthToSpace(void) : _input(nullptr), _output(nullptr), _block_size(0), _axises{} + { + // DO NOTHING + } + +public: + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. + * @param[in] block_size Block size. + * @param[in] axises Axises of rank 4 + */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size, + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + int32_t _block_size; + ::arm_compute::Coordinates _axises; +}; + +#endif /*__SIMPLE_DEPTH_TO_SPACE_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc index 089c783c1..ae740bb10 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "internal/layers/SimpleEmbeddingLookup.h" #include <arm_compute/runtime/CL/CLScheduler.h> @@ -6,7 +21,8 @@ void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values, ::arm_compute::ITensor *output) { - // Assume that verification of operands are already done at Planner::visit() + assert(values->info()->num_dimensions() == output->info()->num_dimensions()); + assert(values->info()->num_dimensions() > 1 && values->info()->num_dimensions() <= 4); _lookups = lookups; _values = values; _output = output; @@ -25,85 +41,62 @@ void SimpleEmbeddingLookup::run() // type of elements of lookups is always integer const int32_t *lookups_buf = reinterpret_cast<int32_t *>(_lookups->buffer()); - const auto values_buf = _values->buffer(); - auto output_buf = _output->buffer(); const auto lookups_info = _lookups->info(); const auto values_info = _values->info(); const auto output_info = _output->info(); - // TODO Refactor below duplicated code! - const auto values_rank = values_info->num_dimensions(); - switch (values_rank) + // NOTE The first dimension's position is always at the end of dimensions. 
+ const auto first_dim_pos = values_info->num_dimensions() - 1; + + const size_t first_dim = values_info->dimension(first_dim_pos); + for (size_t i = 0; i < lookups_info->dimension(0); ++i) { - case 2: - // (H,W) in nnapi -> (W,H) in acl - { - const size_t row_size = values_info->dimension(1); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 3: - // (B,H,W) in nnapi -> (W,H,B) in acl - { - const size_t row_size = values_info->dimension(2); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 4: - // (N,H,W,C) in nnapi -> (N,C,H,W) in acl - { - const size_t row_size = values_info->dimension(3); - const size_t row_bytes = values_info->total_size() / row_size; - for (size_t i = 0; i < lookups_info->dimension(0); ++i) - { - if (lookups_buf[i] < 0 || lookups_buf[i] >= row_size) - throw std::runtime_error("Embedding Lookup: index 
out of bounds."); - - size_t idx = lookups_buf[i]; - size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, 0, idx}); - size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i}); - - unsigned char *sink_addr = output_buf + row_offset_by_i; - unsigned char *source_addr = values_buf + row_offset_by_idx; - memcpy(sink_addr, source_addr, row_bytes); - } - } - break; - case 1: - // In this case, shape of values actually is matrix but the height(row size) is 1 in acl. If - // row size is 1, this op is not needed and it means this situtation could be wrong. - throw std::runtime_error("Wrong usage of EmbeddingLookup op!"); - default: - throw std::runtime_error("Not supported rank!"); + if (lookups_buf[i] < 0 || lookups_buf[i] >= first_dim) + throw std::runtime_error("Embedding Lookup: index out of bounds."); } + // If each strides of values and output are different, applied padding size of the two tensors are + // different, therefore, it can not be copied at once. 
+ auto can_copy_at_once = [&]() -> bool { + const auto &values_strides = values_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + for (size_t i = 0; i < first_dim_pos; ++i) + { + if (values_strides[i] != values_strides[i]) + return false; + } + + return true; + }; + + using ::arm_compute::Window; + using ::arm_compute::Iterator; + + size_t copy_bytes; + Window window; + if (can_copy_at_once()) + { + copy_bytes = values_info->total_size() / first_dim; + window.use_tensor_dimensions(output_info->tensor_shape(), first_dim_pos); + } + else + { + copy_bytes = values_info->dimension(0) * values_info->element_size(); + window.use_tensor_dimensions(output_info->tensor_shape(), Window::DimY); + } + + Iterator it(_output, window); + execute_window_loop(window, + [&](const ::arm_compute::Coordinates &id) { + ::arm_compute::Coordinates values_id = id; + const int idx = id[first_dim_pos]; + values_id.set(first_dim_pos, lookups_buf[idx]); + memcpy(it.ptr(), _values->ptr_to_element(values_id), copy_bytes); + }, + it); + if (::internal::arm_compute::isGpuMode()) { auto &q = ::arm_compute::CLScheduler::get().queue(); diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h index 9f2cd977f..fd499437f 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h @@ -1,16 +1,55 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef __SIMPLE_EMBEDDING_LOOKUP_H__ #define __SIMPLE_EMBEDDING_LOOKUP_H__ #include "internal/arm_compute.h" -#include <arm_compute/core/ITensor.h> -#include <arm_compute/runtime/IFunction.h> +/** + * @file SimpleEmbeddingLookup.h + * @brief This file contains SimpleEmbeddingLookup class + * @ingroup COM_AI_RUNTIME + */ + +/** + * @brief Class to run SimpleEmbeddingLookup Layer + */ class SimpleEmbeddingLookup : public ::arm_compute::IFunction { public: + SimpleEmbeddingLookup(void) : _lookups(nullptr), _values(nullptr), _output(nullptr) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] lookups 1D tensor which contains lookup values + * @param[in] values The source tensor + * @param[in] output The destination tensor + * @return N/A + */ void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values, ::arm_compute::ITensor *output); + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ void run() override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.cc new file mode 100644 index 000000000..7f8ae2505 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.cc @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleHashtableLookupLayer.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleHashtableLookupLayer::configure(::arm_compute::ITensor *lookups, + ::arm_compute::ITensor *keys, + ::arm_compute::ITensor *values, + ::arm_compute::ITensor *output, + ::arm_compute::ITensor *hits) +{ + _lookups = lookups; + _keys = keys; + _values = values; + _output = output; + _hits = hits; + _lookup_indices.resize(lookups->info()->dimension(0), -1); +} + +void SimpleHashtableLookupLayer::run() +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_lookups)->map(queue); + CAST_CL(_keys)->map(queue); + CAST_CL(_values)->map(queue); + CAST_CL(_output)->map(queue); + CAST_CL(_hits)->map(queue); + } + + const int32_t *lookups_buf = reinterpret_cast<int32_t *>(_lookups->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(_keys->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + + const auto lookups_info = _lookups->info(); + const auto values_info = _values->info(); + const auto keys_info = _keys->info(); + const auto output_info = _output->info(); + + // NOTE The first dimension's position must be always at the end of dimensions. 
+ const auto first_dim_pos = values_info->num_dimensions() - 1; + const size_t first_dim = values_info->dimension(first_dim_pos); + + std::map<int32_t, size_t> key_map; + const int keys_num = keys_info->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const int lookups_num = lookups_info->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { + if (it->second >= first_dim) + throw std::runtime_error("HashTable Lookup: index out of bounds."); + _lookup_indices[i] = it->second; + } + } + + // If each strides of values and output are different, applied padding size of the two tensors are + // different, therefore, it can not be copied at once. + auto can_copy_at_once = [&]() -> bool { + const auto &values_strides = values_info->strides_in_bytes(); + const auto &output_strides = output_info->strides_in_bytes(); + + for (size_t i = 0; i < first_dim_pos; ++i) + { + if (values_strides[i] != values_strides[i]) + return false; + } + + return true; + }; + + using ::arm_compute::Window; + using ::arm_compute::Iterator; + using ::arm_compute::Coordinates; + + size_t copy_bytes; + Window window; + if (can_copy_at_once()) + { + copy_bytes = values_info->total_size() / first_dim; + window.use_tensor_dimensions(output_info->tensor_shape(), first_dim_pos); + } + else + { + copy_bytes = values_info->dimension(0) * values_info->element_size(); + window.use_tensor_dimensions(output_info->tensor_shape(), Window::DimY); + } + + Iterator it(_output, window); + execute_window_loop(window, + [&](const Coordinates &id) { + Coordinates values_id = id; + const int idx = id[first_dim_pos]; + const int lookup_index = _lookup_indices[idx]; + if (lookup_index >= 0) + { + values_id.set(first_dim_pos, lookup_index); + memcpy(it.ptr(), _values->ptr_to_element(values_id), copy_bytes); + 
hits_buf[lookup_index] = 1; + } + else + { + memset(it.ptr(), 0, copy_bytes); + hits_buf[lookup_index] = 0; + } + }, + it); + + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_lookups)->unmap(queue); + CAST_CL(_keys)->unmap(queue); + CAST_CL(_values)->unmap(queue); + CAST_CL(_output)->unmap(queue); + CAST_CL(_hits)->unmap(queue); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.h new file mode 100644 index 000000000..ba9d2ec0d --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleHashtableLookupLayer.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SIMPLE_HASHTABLE_LOOKUP_H__ +#define __SIMPLE_HASHTABLE_LOOKUP_H__ + +#include "internal/arm_compute.h" + +class SimpleHashtableLookupLayer : public ::arm_compute::IFunction +{ +public: + SimpleHashtableLookupLayer(void) + : _lookups(nullptr), _keys(nullptr), _values(nullptr), _output(nullptr), _hits(nullptr) + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *keys, + ::arm_compute::ITensor *values, ::arm_compute::ITensor *output, + ::arm_compute::ITensor *hits); + + void run() override; + +private: + ::arm_compute::ITensor *_lookups; + ::arm_compute::ITensor *_keys; + ::arm_compute::ITensor *_values; + ::arm_compute::ITensor *_output; + ::arm_compute::ITensor *_hits; + std::vector<int32_t> _lookup_indices; +}; + +#endif /*__SIMPLE_HASHTABLE_LOOKUP_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.cc new file mode 100644 index 000000000..d3943ad40 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleNeg.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleNeg::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output) +{ + _input = input; + _output = output; +} + +void SimpleNeg::run() +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->map(queue); + CAST_CL(_output)->map(queue); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_output->info()->tensor_shape()); + + execute_window_loop(window, [this](const arm_compute::Coordinates &id) { + // NOTE Must be two input tensors of identical type + // Must be output tensor of the same type as input0. + assert(_input->info()->data_type() == _output->info()->data_type()); + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::F32: + { + const auto input_value = *reinterpret_cast<float *>(_input->ptr_to_element(id)); + *reinterpret_cast<float *>(_output->ptr_to_element(id)) = -input_value; + break; + } + case ::arm_compute::DataType::S32: + { + const auto input_value = *reinterpret_cast<int32_t *>(_input->ptr_to_element(id)); + *reinterpret_cast<int32_t *>(_output->ptr_to_element(id)) = -input_value; + break; + } + case ::arm_compute::DataType::U32: + { + const auto input_value = *reinterpret_cast<uint32_t *>(_input->ptr_to_element(id)); + *reinterpret_cast<uint32_t *>(_output->ptr_to_element(id)) = -input_value; + break; + } + default: + throw std::runtime_error("Not supported, yet"); + break; + } + }); + + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->unmap(queue); + CAST_CL(_output)->unmap(queue); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.h index cb3f36337..4ca88e7f8 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/PadLayer.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleNeg.h @@ -1,41 +1,39 @@ -/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PAD_LAYER_H__
-#define __PAD_LAYER_H__
-
-#include <arm_compute/runtime/CL/CLTensor.h>
-#include <arm_compute/runtime/CL/functions/CLFillBorder.h>
-
-class PadLayer : public ::arm_compute::IFunction
-{
-public:
- void configure(::arm_compute::ICLTensor *input, ::arm_compute::ICLTensor *output,
- unsigned int border_width);
- void run(void) override;
-
-private:
- ::arm_compute::ICLTensor *_input;
- ::arm_compute::ICLTensor *_output;
- int _border_width;
- int _output_height;
- int _output_width;
-
- ::arm_compute::CLFillBorder _fillborderkernel;
- void populateOutput();
-};
-
-#endif // __PAD_LAYER_H__
+/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_NEG_H__ +#define __SIMPLE_NEG_H__ + +#include "internal/arm_compute.h" + +class SimpleNeg : public ::arm_compute::IFunction +{ +public: + SimpleNeg(void) : _input(nullptr), _output(nullptr) + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; +}; + +#endif /*__SIMPLE_NEG_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.cc new file mode 100644 index 000000000..2a0a25f0c --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.cc @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "internal/arm_compute.h" +#include "SimplePackLayer.h" + +void SimplePackLayer::configure(const std::vector<::arm_compute::ICLTensor *> &input_vector, + ::arm_compute::ICLTensor *output, int32_t axis) +{ + uint32_t nr_inputs = input_vector.size(); + uint32_t output_rank = output->info()->num_dimensions(); + const ::arm_compute::PermutationVector pv{1, 2, 0}; + _cl_permuted_vector.resize(nr_inputs); + _cl_permute_vector.resize(nr_inputs); + + _output = output; + // A negative axis implies axis from the end. + // For example, axis = -1 implies the first axis from the end, i.e. axis = Rank - 1. + // Similarly, axis = -2 imples second axis from the end, i.e. axis = Rank - 2. + if (axis < 0) + { + axis += output_rank; + } + _axis = ToARMComputeAxis(output_rank, axis).value(); + _cl_reshape_vector.resize(nr_inputs); + + ::arm_compute::TensorShape subTensor_shape{}; + for (int i = 0; i < output_rank; i++) + { + if (i != _axis) + { + subTensor_shape.set(i, _output->info()->tensor_shape()[i]); + } + else + { + subTensor_shape.set(i, 1); + } + } + + auto subTensor_offset = ::arm_compute::Coordinates{}; + subTensor_offset.set_num_dimensions(output_rank); + + for (int i = 0; i < input_vector.size(); i++) + { + _input_vector.push_back(input_vector[i]); + subTensor_offset[_axis] = i; + auto temp_tensor = std::make_shared<::arm_compute::CLSubTensor>( + CAST_CL(_output), subTensor_shape, subTensor_offset, true); + _sub_tensor_vector.push_back(temp_tensor); + // configure to resize of input tensor in sub tensor offseted, dimension expansion will be + // automatic + _cl_permute_vector[i].configure(CAST_CL(_input_vector[i]), &_cl_permuted_vector[i], pv); + _cl_reshape_vector[i].configure(&_cl_permuted_vector[i], _sub_tensor_vector[i].get()); + _cl_permuted_vector[i].allocator()->allocate(); + } +} + +void SimplePackLayer::run(void) +{ + for (int i = 0; i < 
_input_vector.size(); i++) + { + _cl_permute_vector[i].run(); + _cl_reshape_vector[i].run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.h new file mode 100644 index 000000000..2c2fc37f2 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePackLayer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __SIMPLE_PACK_LAYER_H__ +#define __SIMPLE_PACK_LAYER_H__ + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/CLSubTensor.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/CL/functions/CLPermute.h> + +class SimplePackLayer : public ::arm_compute::IFunction +{ +public: + SimplePackLayer(void) + : _cl_permuted_vector{}, _input_vector{}, _sub_tensor_vector{}, _cl_reshape_vector{}, + _cl_permute_vector{}, _output(nullptr), _axis(0) + { + // DO NOTHING + } + +public: + void configure(const std::vector<::arm_compute::ICLTensor *> &input_vector, + ::arm_compute::ICLTensor *output, int axis); + +public: + void run(void) override; + +private: + std::vector<::arm_compute::CLTensor> _cl_permuted_vector; + std::vector<::arm_compute::ICLTensor *> _input_vector; + std::vector<std::shared_ptr<::arm_compute::CLSubTensor>> _sub_tensor_vector; + std::vector<::arm_compute::CLReshapeLayer> _cl_reshape_vector; + std::vector<::arm_compute::CLPermute> _cl_permute_vector; + ::arm_compute::ICLTensor *_output; + int _axis; +}; + +#endif // __SIMPLE_PACK_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc new file mode 100644 index 000000000..64236603f --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.cc @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimplePadLayer.h" +#include <arm_compute/runtime/CL/CLScheduler.h> + +namespace +{ +bool validate_arg(const ::arm_compute::ITensor *input, const ::arm_compute::ITensor *output, + const ::arm_compute::ITensor *padding_size, + const ::arm_compute::Coordinates &axises) +{ + const int input_batch = input->info()->tensor_shape()[axises[0]]; + const int input_height = input->info()->tensor_shape()[axises[1]]; + const int input_width = input->info()->tensor_shape()[axises[2]]; + const int input_depth = input->info()->tensor_shape()[axises[3]]; + + const int output_batch = output->info()->tensor_shape()[axises[0]]; + const int output_height = output->info()->tensor_shape()[axises[1]]; + const int output_width = output->info()->tensor_shape()[axises[2]]; + const int output_depth = output->info()->tensor_shape()[axises[3]]; + + auto pad_batch_up = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 0})); + auto pad_batch_down = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 0})); + auto pad_height_top = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 1})); + auto pad_height_bottom = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 1})); + auto pad_width_left = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 2})); + auto pad_width_right = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 2})); + auto pad_depth_front = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 3})); + auto pad_depth_back = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({1, 3})); + + const int padded_batch = input_batch + pad_batch_up + pad_batch_down; + const int padded_height = input_height + pad_height_top + pad_height_bottom; + const int padded_width = input_width + pad_width_left + pad_width_right; + const int 
padded_depth = input_depth + pad_depth_front + pad_depth_back; + + return (padded_batch == output_batch) && (padded_height == output_height) && + (padded_width == output_width) && (padded_depth == output_depth); +} +} // namespace + +void SimplePadLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + ::arm_compute::ITensor *padding_size, + const ::arm_compute::Coordinates &axises) +{ + + const auto rank = axises.num_dimensions(); + assert(rank == 4); + assert(input != nullptr && output != nullptr && padding_size != nullptr); + + for (int i = 0; i < rank; ++i) + { + assert(axises[i] >= 0); + assert(axises[i] < rank); + } + + _input = input; + _output = output; + _padding_size = padding_size; + _axises = axises; +} + +template <typename T> +inline void ApplyPadding(const ::arm_compute::ITensor *input_data, + const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::ITensor *padding_size, + ::arm_compute::ITensor *output_data, + const ::arm_compute::TensorShape &output_shape, + const ::arm_compute::Coordinates &axises, T zero_value) +{ + + assert(validate_arg(input_data, output_data, padding_size, axises) && + "Padded Input shape does not match to output shape"); + + const int input_batch = input_shape[axises[0]]; + const int input_height = input_shape[axises[1]]; + const int input_width = input_shape[axises[2]]; + const int input_depth = input_shape[axises[3]]; + + const int output_batch = output_shape[axises[0]]; + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int output_depth = output_shape[axises[3]]; + + // Padding size for Up, Top, Left and Front are required. 
+ auto pad_batch_up = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 0})); + auto pad_height_top = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 1})); + auto pad_width_left = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 2})); + auto pad_depth_front = *reinterpret_cast<const int32_t *>(padding_size->ptr_to_element({0, 3})); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + for (int out_w = 0; out_w < output_width; ++out_w) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); + + if (out_b < pad_batch_up || out_b >= (input_batch + pad_batch_up) || + out_h < pad_height_top || out_h >= (input_height + pad_height_top) || + out_w < pad_width_left || out_w >= (input_width + pad_width_left) || + out_d < pad_depth_front || out_d >= (input_depth + pad_depth_front)) + { + *reinterpret_cast<T *>(output_data->ptr_to_element(output_id)) = zero_value; + } + else + { + auto input_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b - pad_batch_up, out_h - pad_height_top, + out_w - pad_width_left, out_d - pad_depth_front}, + axises); + *reinterpret_cast<T *>(output_data->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input_data->ptr_to_element(input_id)); + } + } + } + } + } +} +void SimplePadLayer::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_output)->map(q); + CAST_CL(_padding_size)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + ApplyPadding<uint8_t>(_input, _input->info()->tensor_shape(), _padding_size, _output, + _output->info()->tensor_shape(), _axises, + _input->info()->quantization_info().offset); + break; + case 
::arm_compute::DataType::F32: + ApplyPadding<float>(_input, _input->info()->tensor_shape(), _padding_size, _output, + _output->info()->tensor_shape(), _axises, 0.0f); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_output)->unmap(q); + CAST_CL(_padding_size)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h new file mode 100644 index 000000000..8cb6659ce --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimplePadLayer.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SIMPLE_PAD_LAYER_H__ +#define __SIMPLE_PAD_LAYER_H__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimplePadLayer : public ::arm_compute::IFunction +{ +public: + SimplePadLayer(void) : _input(nullptr), _output(nullptr), _padding_size(nullptr), _axises{} + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, + ::arm_compute::ITensor *padding_size, + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + + void run(void) override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; + ::arm_compute::ITensor *_padding_size; + ::arm_compute::Coordinates _axises; +}; + +#endif // __SIMPLE_PAD_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.cc new file mode 100644 index 000000000..b5b3a0950 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleSQRT.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleSQRT::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output) +{ + _input = input; + _output = output; +} + +void SimpleSQRT::run() +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->map(queue); + CAST_CL(_output)->map(queue); + } + + arm_compute::Window window; + window.use_tensor_dimensions(_output->info()->tensor_shape()); + + execute_window_loop(window, [this](const arm_compute::Coordinates &id) { + // NOTE Must be two input tensors of identical type + // Must be output tensor of the same type as input0. + assert(_input->info()->data_type() == _output->info()->data_type()); + + const auto input_value = *reinterpret_cast<float *>(_input->ptr_to_element(id)); + *reinterpret_cast<float *>(_output->ptr_to_element(id)) = sqrt(input_value); + }); + + if (::internal::arm_compute::isGpuMode()) + { + CAST_CL(_input)->unmap(queue); + CAST_CL(_output)->unmap(queue); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.h new file mode 100644 index 000000000..b05a9e32e --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSQRT.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_SQRT_H__ +#define __SIMPLE_SQRT_H__ + +#include "internal/arm_compute.h" + +class SimpleSQRT : public ::arm_compute::IFunction +{ +public: + SimpleSQRT(void) : _input(nullptr), _output(nullptr) + { + // DO NOTHING + } + + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_output; +}; + +#endif /*__SIMPLE_SQRT_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.cc new file mode 100644 index 000000000..f53675b99 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.cc @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/layers/SimpleSpaceToBatchND.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleSpaceToBatchND::configure(::arm_compute::ITensor *input, + ::arm_compute::ITensor *block_size, + ::arm_compute::ITensor *padding_size, + ::arm_compute::ITensor *output) +{ + const auto rank = input->info()->num_dimensions(); + assert(rank == 4); + + _input = input; + _block_size = block_size; + _padding_size = padding_size; + _output = output; +} + +template <typename T> +inline void +SpaceToBatchND(const ::arm_compute::ITensor *input, const ::arm_compute::TensorShape &input_shape, + const ::arm_compute::ITensor *block_size, const ::arm_compute::ITensor *padding_size, + const ::arm_compute::ITensor *output, const ::arm_compute::TensorShape &output_shape, + T zero_value) +{ + const int input_batch = input_shape[3]; + const int input_height = input_shape[1]; + const int input_width = input_shape[0]; + + const int depth = output_shape[2]; + + const int padding_height_left = *reinterpret_cast<int *>(padding_size->ptr_to_element({0, 1})); + const int padding_height_right = *reinterpret_cast<int *>(padding_size->ptr_to_element({1, 1})); + const int padding_width_left = *reinterpret_cast<int *>(padding_size->ptr_to_element({0, 0})); + const int padding_width_right = *reinterpret_cast<int *>(padding_size->ptr_to_element({1, 0})); + const int padded_height = input_height + padding_height_left + padding_height_right; + const int padded_width = input_width + padding_width_left + padding_width_right; + + const int block_size_height = *reinterpret_cast<int *>(block_size->ptr_to_element({1})); + const int block_size_width = *reinterpret_cast<int *>(block_size->ptr_to_element({0})); + + assert(padding_height_left >= 0); + assert(padding_height_right >= 0); + assert(padding_width_left >= 0); + assert(padding_width_right >= 0); + assert(block_size_height >= 1); + assert(block_size_width >= 1); + assert(padded_height % block_size_height == 0); + 
assert(padded_width % block_size_width == 0); + assert(output->info()->dimension(3) == + input->info()->dimension(3) * (block_size_height * block_size_width)); + + for (int in_b = 0; in_b < input_batch; ++in_b) + { + for (int in_d = 0; in_d < depth; ++in_d) + { + for (int in_h = 0; in_h < padded_height; ++in_h) + { + for (int in_w = 0; in_w < padded_width; ++in_w) + { + const int out_d = in_d; + const int out_h = in_h / block_size_height; + const int out_w = in_w / block_size_width; + const int out_b = + in_b + + ((in_h % block_size_height) * block_size_width + in_w % block_size_width) * + input_batch; + + const ::arm_compute::Coordinates output_id{out_w, out_h, out_d, out_b}; + + if (in_h < padding_height_left || in_h >= (input_height + padding_height_left) || + in_w < padding_width_left || in_w >= (input_width + padding_width_left)) + { + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = zero_value; + } + else + { + const ::arm_compute::Coordinates input_id{in_w - padding_width_left, + in_h - padding_height_left, in_d, in_b}; + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); + } + } + } + } + } +} +void SimpleSpaceToBatchND::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_block_size)->map(q); + CAST_CL(_padding_size)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::U8: + case ::arm_compute::DataType::QASYMM8: + SpaceToBatchND<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _padding_size, + _output, _output->info()->tensor_shape(), + _input->info()->quantization_info().offset); + break; + case ::arm_compute::DataType::F32: + SpaceToBatchND<float>(_input, _input->info()->tensor_shape(), _block_size, _padding_size, + _output, _output->info()->tensor_shape(), 0.0f); + break; + default: + 
ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_block_size)->unmap(q); + CAST_CL(_padding_size)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.h new file mode 100644 index 000000000..4af961d34 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToBatchND.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __SIMPLE_SPACE_TO_BATCHND_H__ +#define __SIMPLE_SPACE_TO_BATCHND_H__ + +#include "internal/arm_compute.h" + +class SimpleSpaceToBatchND : public ::arm_compute::IFunction +{ +public: + SimpleSpaceToBatchND(void) + : _input(nullptr), _block_size(nullptr), _padding_size(nullptr), _output(nullptr) + { + // DO NOTHING + } + + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[in] block_size Block size. + * @param[in] padding_size Padding size. + * @param[out] output Output tensor. 
+ */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *block_size, + ::arm_compute::ITensor *padding_size, ::arm_compute::ITensor *output); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_block_size; + ::arm_compute::ITensor *_padding_size; + ::arm_compute::ITensor *_output; +}; + +#endif /*__SIMPLE_SPACE_TO_BATCHND_H__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc index 682295f81..3519da1f3 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.cc @@ -19,11 +19,8 @@ #include <arm_compute/runtime/CL/CLScheduler.h> void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, - int32_t block_size, - const ::arm_compute::Coordinates &axises = {3, 1, 0, 2}) + int32_t block_size, const ::arm_compute::Coordinates &axises) { - assert(input->info()->num_dimensions() == 4); - assert(output->info()->num_dimensions() == 4); const auto rank = axises.num_dimensions(); assert(rank == 4); for (int i = 0; i < rank; ++i) @@ -38,26 +35,10 @@ void SimpleSpaceToDepth::configure(::arm_compute::ITensor *input, ::arm_compute: _axises = axises; } -inline int32_t Offset4D(const ::arm_compute::TensorShape &shape, int32_t b, int32_t h, int32_t w, - int32_t d, const ::arm_compute::Coordinates &axises) -{ - // b, h, w, d >= 0 - size_t indexes[4]; - indexes[axises[0]] = b; - indexes[axises[1]] = h; - indexes[axises[2]] = w; - indexes[axises[3]] = d; - - int32_t offset = indexes[3] * shape[2] * shape[1] * shape[0]; - offset += indexes[2] * shape[1] * shape[0]; - offset += indexes[1] * shape[0]; - offset += indexes[0]; - return offset; -} - template <typename T> -inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape &input_shape, - int32_t block_size, T 
*output_data, +inline void SpaceToDepth(const ::arm_compute::ITensor *input, + const ::arm_compute::TensorShape &input_shape, int32_t block_size, + ::arm_compute::ITensor *output, const ::arm_compute::TensorShape &output_shape, const ::arm_compute::Coordinates &axises) { @@ -66,16 +47,6 @@ inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape & const int input_width = input_shape[axises[2]]; const int input_depth = input_shape[axises[3]]; - const int output_batch = output_shape[axises[0]]; - const int output_height = output_shape[axises[1]]; - const int output_width = output_shape[axises[2]]; - const int output_depth = output_shape[axises[3]]; - - assert(input_batch == output_batch); - assert(input_height == output_height * block_size); - assert(input_width == output_width * block_size); - assert(input_depth * block_size * block_size == output_depth); - for (int in_b = 0; in_b < input_batch; ++in_b) { for (int in_h = 0; in_h < input_height; ++in_h) @@ -90,10 +61,13 @@ inline void SpaceToDepth(const T *input_data, const ::arm_compute::TensorShape & const int out_d = in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth; - const int input_index = Offset4D(input_shape, in_b, in_h, in_w, in_d, axises); - const int output_index = Offset4D(output_shape, out_b, out_h, out_w, out_d, axises); + auto input_id = + asARMComputeCoordinates(::arm_compute::Coordinates{in_b, in_h, in_w, in_d}, axises); + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{out_b, out_h, out_w, out_d}, axises); - output_data[output_index] = input_data[input_index]; + *reinterpret_cast<T *>(output->ptr_to_element(output_id)) = + *reinterpret_cast<T *>(input->ptr_to_element(input_id)); } } } @@ -110,35 +84,16 @@ void SimpleSpaceToDepth::run() CAST_CL(_output)->map(q); } - auto input_buf = _input->buffer(); - auto output_buf = _output->buffer(); switch (_input->info()->data_type()) { case ::arm_compute::DataType::U8: case 
::arm_compute::DataType::QASYMM8: - SpaceToDepth(reinterpret_cast<const uint8_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<uint8_t *>(output_buf), - _output->info()->tensor_shape(), _axises); - break; - case ::arm_compute::DataType::S8: - SpaceToDepth(reinterpret_cast<const int8_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<int8_t *>(output_buf), - _output->info()->tensor_shape(), _axises); - break; - case ::arm_compute::DataType::U32: - SpaceToDepth(reinterpret_cast<const uint32_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<uint32_t *>(output_buf), - _output->info()->tensor_shape(), _axises); - break; - case ::arm_compute::DataType::S32: - SpaceToDepth(reinterpret_cast<const int32_t *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<int32_t *>(output_buf), - _output->info()->tensor_shape(), _axises); + SpaceToDepth<uint8_t>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); break; case ::arm_compute::DataType::F32: - SpaceToDepth(reinterpret_cast<const float *>(input_buf), _input->info()->tensor_shape(), - _block_size, reinterpret_cast<float *>(output_buf), - _output->info()->tensor_shape(), _axises); + SpaceToDepth<float>(_input, _input->info()->tensor_shape(), _block_size, _output, + _output->info()->tensor_shape(), _axises); break; default: ARM_COMPUTE_ERROR("DataType not supported"); diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h index f5e028b1c..9e87c364c 100644 --- a/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleSpaceToDepth.h @@ -14,25 +14,44 @@ * limitations under the License. 
 */ +/** + * @file SimpleSpaceToDepth.h + * @brief This file contains SimpleSpaceToDepth class + * @ingroup COM_AI_RUNTIME + */ + #ifndef __SIMPLE_SPACE_TO_DEPTH_H__ #define __SIMPLE_SPACE_TO_DEPTH_H__ #include "internal/arm_compute.h" -#include <arm_compute/core/ITensor.h> -#include <arm_compute/runtime/IFunction.h> +#include "internal/arm_compute/Cast.h" +/** + * @brief Class to run SimpleSpaceToDepth Layer + */ class SimpleSpaceToDepth : public ::arm_compute::IFunction { public: - /** Initialise input and output - * - * @param[in] input First tensor input. - * @param[out] output Output tensor. - * @param[in] block_size Block size. + SimpleSpaceToDepth(void) : _input(nullptr), _output(nullptr), _block_size(0), _axises{} + { + // DO NOTHING + } + + /** + * @brief Configure the layer + * @param[in] input First tensor input. + * @param[in] output Output tensor. + * @param[in] block_size Block size. + * @param[in] axises Axises of rank 4 + * @return N/A */ void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output, int32_t block_size, - const ::arm_compute::Coordinates &axises); + const ::arm_compute::Coordinates &axises = getARMComputeAxises(4)); + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ void run() override; private: diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.cc new file mode 100644 index 000000000..abc291289 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.cc @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal/layers/SimpleTransposeConv.h" +#include <arm_compute/runtime/CL/CLScheduler.h> + +void SimpleTransposeConv::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, + ::arm_compute::ITensor *output, + ::arm_compute::PadStrideInfo &tconv_info, + ::arm_compute::Coordinates axises) +{ + auto rank = axises.num_dimensions(); + + assert(rank == 4); + + _input = input; + _weights = weights; + _output = output; + _stride_width = tconv_info.stride().first; + _stride_height = tconv_info.stride().second; + _pad_width = tconv_info.pad_left(); + _pad_height = tconv_info.pad_top(); + _axises = axises; +} + +template <typename T> +inline void ApplyTransposeConv( + const ::arm_compute::TensorShape &input_shape, const ::arm_compute::ITensor *input_data, + const ::arm_compute::TensorShape &filter_shape, const ::arm_compute::ITensor *filter_data, + const ::arm_compute::TensorShape &output_shape, const ::arm_compute::ITensor *output_data, + const int32_t stride_width, const int32_t stride_height, const int32_t pad_width, + const int32_t pad_height, const ::arm_compute::Coordinates axises) +{ + const int batches = input_shape[axises[0]]; + const int input_height = input_shape[axises[1]]; + const int input_width = input_shape[axises[2]]; + const int input_depth = input_shape[axises[3]]; + + const int filter_height = filter_shape[axises[1]]; + const int filter_width = filter_shape[axises[2]]; + + const int output_height = output_shape[axises[1]]; + const int output_width = output_shape[axises[2]]; + const int output_depth = 
output_shape[axises[3]]; + + assert(batches == output_shape[axises[0]]); + assert(input_depth == filter_shape[axises[3]]); + assert(filter_shape[axises[0]] == output_depth); + + // Although transpose convolution simplifies to convolution with transposed + // weights for strides of 1, non-unitary striding complicates matters. To + // keep this reference implementation as clear as possible, we use a + // "scatter" access pattern, where we loop through all the input elements, + // computing their influence on the output, rather than looping through the + // output elements in the typical "gather" access pattern of a conv. We + // therefore must initialize the output array to zero. + + // Loop through input elements one at a time. + for (int batch = 0; batch < batches; ++batch) + { + for (int in_y = 0; in_y < input_height; ++in_y) + { + for (int in_x = 0; in_x < input_width; ++in_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + // Loop through the output elements it will influence + const int out_x_origin = (in_x * stride_width) - pad_width; + const int out_y_origin = (in_y * stride_height) - pad_height; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + // Compute output element location + const int out_x = out_x_origin + filter_x; + const int out_y = out_y_origin + filter_y; + // We cannot accumulate out of bounds + if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && + (out_y < output_height)) + { + auto input_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{batch, in_y, in_x, in_channel}, axises); + auto filter_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{in_channel, filter_y, filter_x, out_channel}, + axises); + auto output_id = asARMComputeCoordinates( + ::arm_compute::Coordinates{batch, out_y, out_x, out_channel}, axises); + T input_value = 
*reinterpret_cast<T *>(input_data->ptr_to_element(input_id)); + T filter_value = *reinterpret_cast<T *>(filter_data->ptr_to_element(filter_id)); + *reinterpret_cast<T *>(output_data->ptr_to_element(output_id)) += + input_value * filter_value; + } + } + } + } + } + } + } + } +} + +void SimpleTransposeConv::run() +{ + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->map(q); + CAST_CL(_weights)->map(q); + CAST_CL(_output)->map(q); + } + + switch (_input->info()->data_type()) + { + case ::arm_compute::DataType::S32: + ApplyTransposeConv<int32_t>(_input->info()->tensor_shape(), _input, + _weights->info()->tensor_shape(), _weights, + _output->info()->tensor_shape(), _output, _stride_width, + _stride_height, _pad_width, _pad_height, _axises); + break; + case ::arm_compute::DataType::F32: + ApplyTransposeConv<float>(_input->info()->tensor_shape(), _input, + _weights->info()->tensor_shape(), _weights, + _output->info()->tensor_shape(), _output, _stride_width, + _stride_height, _pad_width, _pad_height, _axises); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + if (::internal::arm_compute::isGpuMode()) + { + auto &q = ::arm_compute::CLScheduler::get().queue(); + + CAST_CL(_input)->unmap(q); + CAST_CL(_weights)->unmap(q); + CAST_CL(_output)->unmap(q); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.h new file mode 100644 index 000000000..c5519828b --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleTransposeConv.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __TRANSPOSE_CONV_EX__ +#define __TRANSPOSE_CONV_EX__ + +#include "internal/arm_compute.h" +#include "internal/arm_compute/Cast.h" + +class SimpleTransposeConv : public ::arm_compute::IFunction +{ +public: + SimpleTransposeConv() + : _input(nullptr), _weights(nullptr), _output(nullptr), _stride_width(0), _stride_height(0), + _pad_width(0), _pad_height(0) + { + // DO NOTHING + } + + /** Initialise input and output + * + * @param[in] input First tensor input. + * @param[in] weights Weights + * @param[out] output Output tensor. + * @param[in] tconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. 
+ * @param[in] axises Axises of rank 4 + */ + void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights, + ::arm_compute::ITensor *output, ::arm_compute::PadStrideInfo &tconv_info, + ::arm_compute::Coordinates axises = getARMComputeAxises(4)); + + void run() override; + +private: + ::arm_compute::ITensor *_input; + ::arm_compute::ITensor *_weights; + ::arm_compute::ITensor *_output; + int32_t _stride_width; + int32_t _stride_height; + int32_t _pad_width; + int32_t _pad_height; + ::arm_compute::Coordinates _axises; +}; + +#endif /*__TRANSPOSE_CONV_EX__ */ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc new file mode 100644 index 000000000..910595a44 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + */ +#include "internal/arm_compute.h" +#include "SimpleUnpackLayer.h" + +void SimpleUnpackLayer::configure(::arm_compute::ICLTensor *input, + const std::vector<::arm_compute::ICLTensor *> &output_vector, + int32_t axis) +{ + uint32_t nr_outputs = output_vector.size(); + _cl_permuted_vector.resize(nr_outputs); + _cl_permute_vector.resize(nr_outputs); + uint32_t input_rank = input->info()->num_dimensions(); + const ::arm_compute::PermutationVector pv{2, 0, 1}; + _input = input; + // Negative axis is supported, -1 implies R-1 axis where R is input rank + if (axis < 0) + { + axis += input_rank; + } + _axis = ToARMComputeAxis(input_rank, axis).value(); + _cl_reshape_vector.resize(nr_outputs); + + ::arm_compute::TensorShape subTensor_shape{}; + for (int i = 0; i < input_rank; i++) + { + if (i != _axis) + { + subTensor_shape.set(i, _input->info()->tensor_shape()[i]); + } + else + { + subTensor_shape.set(i, 1); + } + } + + auto subTensor_offset = ::arm_compute::Coordinates{}; + subTensor_offset.set_num_dimensions(input_rank); + + for (int i = 0; i < output_vector.size(); i++) + { + _output_vector.push_back(output_vector[i]); + subTensor_offset[_axis] = i; + auto temp_tensor = std::make_shared<::arm_compute::CLSubTensor>( + CAST_CL(_input), subTensor_shape, subTensor_offset, true); + _sub_tensor_vector.push_back(temp_tensor); + // Copies into the subtensor + _cl_permute_vector[i].configure(_sub_tensor_vector[i].get(), &_cl_permuted_vector[i], pv); + _cl_reshape_vector[i].configure(&_cl_permuted_vector[i], CAST_CL(_output_vector[i])); + _cl_permuted_vector[i].allocator()->allocate(); + } +} + +void SimpleUnpackLayer::run(void) +{ + for (int i = 0; i < _output_vector.size(); i++) + { + _cl_permute_vector[i].run(); + _cl_reshape_vector[i].run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h new file mode 100644 index 000000000..52fc7513d --- /dev/null +++ 
b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __UNPACK_LAYER_H__ +#define __UNPACK_LAYER_H__ + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/CLSubTensor.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/CL/functions/CLPermute.h> + +class SimpleUnpackLayer : public ::arm_compute::IFunction +{ +public: + SimpleUnpackLayer(void) + : _cl_permuted_vector{}, _output_vector{}, _sub_tensor_vector{}, _cl_reshape_vector{}, + _cl_permute_vector{}, _input(nullptr), _axis(0) + { + // DO NOTHING + } + +public: + void configure(::arm_compute::ICLTensor *input, + const std::vector<::arm_compute::ICLTensor *> &output_vector, int32_t axis); + +public: + void run(void) override; + +private: + std::vector<::arm_compute::CLTensor> _cl_permuted_vector; + std::vector<::arm_compute::ICLTensor *> _output_vector; + std::vector<std::shared_ptr<::arm_compute::CLSubTensor>> _sub_tensor_vector; + std::vector<::arm_compute::CLReshapeLayer> _cl_reshape_vector; + std::vector<::arm_compute::CLPermute> _cl_permute_vector; + ::arm_compute::ICLTensor *_input; + int32_t _axis; +}; + +#endif // __UNPACK_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc 
b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc deleted file mode 100644 index 3f988a819..000000000 --- a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "SquaredDifferenceOperation.h" -#include "internal/arm_compute.h" - -void SquaredDifferenceOperation::configure(::arm_compute::ITensor *input1, - ::arm_compute::ITensor *input2, - ::arm_compute::ITensor *output, - ::arm_compute::ConvertPolicy ConvertPolicy, float scale, - ::arm_compute::RoundingPolicy RoundingPolicy) -{ - _input1 = input1; - _input2 = input2; - _output = output; - - if (::internal::arm_compute::isGpuMode()) - { - _cl_sub.configure(CAST_CL(input1), CAST_CL(input2), CAST_CL(output), ConvertPolicy); - _cl_mul.configure(CAST_CL(output), CAST_CL(output), CAST_CL(output), scale, ConvertPolicy, - RoundingPolicy); - } - else - { - _neon_sub.configure(CAST_NE(input1), CAST_NE(input2), CAST_NE(output), ConvertPolicy); - _neon_mul.configure(CAST_NE(output), CAST_NE(output), CAST_NE(output), scale, ConvertPolicy, - RoundingPolicy); - } -} - -void SquaredDifferenceOperation::run(void) -{ - if (::internal::arm_compute::isGpuMode()) - { - _cl_sub.run(); - _cl_mul.run(); - } - else - { - _neon_sub.run(); - _neon_mul.run(); - } -} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h b/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h deleted file mode 100644 index 3782c4e8c..000000000 --- a/runtimes/pure_arm_compute/src/internal/layers/SquaredDifferenceOperation.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __SQUARED_DIFFERENCE_OPERATION_H__ -#define __SQUARED_DIFFERENCE_OPERATION_H__ - -#include <arm_compute/runtime/Tensor.h> -#include <arm_compute/runtime/CL/CLTensor.h> - -#include <arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h> -#include <arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h> -#include 
<arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h> -#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h> - -class SquaredDifferenceOperation : public ::arm_compute::IFunction -{ -public: - void configure(::arm_compute::ITensor *input1, ::arm_compute::ITensor *input2, - ::arm_compute::ITensor *output, ::arm_compute::ConvertPolicy ConvertPolicy, - float scale, ::arm_compute::RoundingPolicy RoundingPolicy); - -public: - void run(void) override; - -private: - ::arm_compute::ITensor *_input1; - ::arm_compute::ITensor *_input2; - - ::arm_compute::ITensor *_output; - -private: - ::arm_compute::CLArithmeticSubtraction _cl_sub; - ::arm_compute::CLPixelWiseMultiplication _cl_mul; - - ::arm_compute::NEArithmeticSubtraction _neon_sub; - ::arm_compute::NEPixelWiseMultiplication _neon_mul; -}; -#endif // __SQUARED_DIFFERENCE_OPERATION_H__ |