diff options
Diffstat (limited to 'compute/ARMComputeEx/src/runtime')
47 files changed, 5150 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp new file mode 100644 index 000000000..158fe0b0c --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/CL/CLFunctionsEx.h" + +// NOTE This empty file aims to validate "CLFunctionsEx.h". +// DO NOT REMOVE this file. diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp new file mode 100644 index 000000000..ae64a6edd --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLArgOperation.h" + +#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +namespace arm_compute +{ + +CLArgOperation::CLArgOperation() +{ + // DO NOTHING +} + +void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, + ArgOperation op) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op)); + _input = input; + _output = output; + _axis = axis; + _arg_op = op; + // NOTE The argminmax_axis must have no duplication. + _num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = _num_of_kernels - 1; + + _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); + _argop_kernels = + arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels); + + TensorShape shape{input->info()->tensor_shape()}; + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + shape.set(_axis[i], 1); + _interm_tensors[i].allocator()->init( + TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()) + .set_data_layout(input->info()->data_layout())); + _interm_tensors[i].allocator()->allocate(); + } + + // Set a vector that is ordered ICLTensors sequentially. 
+ std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ArgMinMax on all kernels + for (size_t i = 0; i < _num_of_kernels; i++) + { + _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op); + } +} + +Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis, + const ITensorInfo *output, ArgOperation op) +{ + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - 1; + + // Create temporary tensor infos + auto interm_tensors = + arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + shape.set(axis[i], 1); + interm_tensors[i].set_data_type(input->data_type()); + interm_tensors[i].set_tensor_shape(shape); + interm_tensors[i].set_num_channels(input->num_channels()); + } + + // Set a vector that is ordered ITensorInfo sequentially. 
+ std::vector<const ITensorInfo *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; i++) + { + tensors.emplace_back(interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Validate argminmax only on all kernels + for (size_t i = 0; i < num_of_kernels; i++) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op)); + } + + return Status{}; +} + +void CLArgOperation::run() +{ + for (size_t i = 0; i < _num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_argop_kernels[i]); + } +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp new file mode 100644 index 000000000..7c5fe5eda --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp new file mode 100644 index 000000000..742fc6f59 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLCast.h" + +#include "arm_compute/core/CL/kernels/CLCastKernel.h" + +using namespace arm_compute; + +void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp new file mode 100644 index 000000000..c2e4ca9ff --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" + +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +using namespace arm_compute; + +void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp new file mode 100644 index 000000000..2781784ca --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +using namespace arm_compute; + +void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..c6b166163 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h" + +using namespace arm_compute; + +void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input, + const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, + arm_compute::ICLTensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_cl_buffer.info(), + _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( + _input->info()->data_layout())); + _cl_reshape.configure(_input, &_cl_buffer); + + _cl_fc.configure(&_cl_buffer, _weights, _biases, _output); + + // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_buffer.allocator()->allocate(); + } + else + { + _cl_fc.configure(_input, _weights, _biases, _output); + } +} + +void CLFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _cl_reshape.run(); + + _cl_fc.run(); +} + +void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp new file mode 100644 index 000000000..6cad9bd2e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLGatherEx.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +using namespace arm_compute; + +void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, + int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return CLGatherExKernel::validate(input, indices, output, axis); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp new file mode 100644 index 000000000..7180e9356 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +using namespace arm_compute; + +void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..86ea5a66d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} + +void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, float epsilon) +{ + auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + k->configure(input, output, gamma, beta, epsilon); + _kernel = std::move(k); +} + +Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp new file mode 100644 index 000000000..be35ea732 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLNeg.h" + +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +using namespace arm_compute; + +void CLNeg::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp new file mode 100644 index 000000000..38adedd10 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPReLU.h" + +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp new file mode 100644 index 000000000..2a34c0664 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), + _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), + _gemm_output(), _add_output(), _is_prepared(false) +{ +} + +Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info) +{ + const int idx_width = 0; + const int idx_height = 1; + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, + output); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != + recurrent_weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != + recurrent_weights->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); + 
ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + hidden_state->tensor_shape()); + + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); + ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate( + ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); + + return Status{}; +} + +void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *recurrent_weights, const ICLTensor *bias, + ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(), + recurrent_weights->info(), bias->info(), + hidden_state->info(), output->info(), info)); + + const int idx_height = 1; + TensorShape shape = + compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + + _is_prepared = false; + + _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + + // Manage intermediate buffers and configure + _memory_group.manage(&_fully_connected_out); + _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); + + _memory_group.manage(&_gemm_output); + _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 
1.f, 0.f); + + _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _memory_group.manage(&_add_output); + + _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, + &_add_output, ConvertPolicy::SATURATE); + + _fully_connected_out.allocator()->allocate(); + _gemm_output.allocator()->allocate(); + + _activation_kernel.configure(&_add_output, hidden_state, info); + _add_output.allocator()->allocate(); + + _copy_kernel.configure(hidden_state, output); +} + +void CLRNNLayerEx::run() +{ + prepare(); + + _memory_group.acquire(); + + _fully_connected_kernel.run(); + _gemm_state_f.run(); + CLScheduler::get().enqueue(_add_kernel); + CLScheduler::get().enqueue(_activation_kernel); + + // copy hidden out to output + CLScheduler::get().enqueue(_copy_kernel); + + _memory_group.release(); +} + +void CLRNNLayerEx::prepare() +{ + if (!_is_prepared) + { + _fully_connected_kernel.prepare(); + _gemm_state_f.prepare(); + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp new file mode 100644 index 000000000..13a25c901 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLReduceOperation.h" + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() +{ +} + +Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, bool keep_dims, + const ReduceOperation &op) +{ + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); + + // Create temporary tensor infos + auto interm_tensors = + arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + auto it = axis.begin(); + for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) + { + shape.set(*it, 1, false); + interm_tensors[i].set_data_type(input->data_type()); + interm_tensors[i].set_tensor_shape(shape); + interm_tensors[i].set_num_channels(input->num_channels()); + interm_tensors[i].set_data_layout(input->data_layout()); + interm_tensors[i].set_quantization_info(input->quantization_info()); + } + + // Set a vector that is ordered ITensorInfo sequentially. 
+ std::vector<const ITensorInfo *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Validate ReduceOperation only on all kernels + it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + } + + if (!keep_dims) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); + } + + return Status{}; +} + +void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, + const std::set<uint32_t> &axis, bool keep_dims, + ReduceOperation op) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op)); + + _axis = axis; + + _input = input; + _output = output; + _keep_dims = keep_dims; + + // NOTE The axis must have no duplication. + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); + + _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); + _reduce_kernels = + arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); + + // Set a vector that is ordered ICLTensors sequentially. 
+ std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ReduceOperation on all kernels + TensorShape shape{input->info()->tensor_shape()}; + auto it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + shape.set(*it, 1, false); + if (!keep_dims || i != (num_of_kernels - 1)) + { + _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); + _memory_group.manage(&_interm_tensors[i]); + } + _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op); + if (i != 0) + { + _interm_tensors[i - 1].allocator()->allocate(); + } + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output); + _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate(); + } +} + +void CLReduceOperation::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + const size_t num_of_kernels = _axis.size(); + for (size_t i = 0; i < num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_reduce_kernels[i]); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp new file mode 100644 index 000000000..c03826891 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +using namespace arm_compute; + +void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>(); + k->configure(input, block_size, padding_size, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp new file mode 100644 index 000000000..0f455f96f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h" + +using namespace arm_compute; + +void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp new file mode 100644 index 000000000..80d50ad94 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "../../topk_v2.h" + +namespace arm_compute +{ + +CLTopKV2::CLTopKV2() + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), + _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ +{ +} + +void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits, int bits) +{ + _total_bits = total_bits; + _bits = bits; + _n = input->info()->tensor_shape()[0]; + + // _total_bits should be divided by _bits. 
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0); + + _k = k; + _radix = 1 << bits; + + _input = input; + _values = values; + _indices = indices; + + std::string topk_env; + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n); + } + else if (topk_env == "GPU") + { + // n should be divided by (_GROUPS * _ITEMS) + ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0); + + _hist_buf_size = _radix * _GROUPS * _ITEMS; + _glob_sum_buf_size = _HISTOSPLIT; + + _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _hist_buf_size); + _glob_sum_buf = + cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int)); + _in_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _out_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _in_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _out_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _p_in_key_buf = &_in_key_buf; + _p_out_key_buf = &_out_key_buf; + _p_in_ind_buf = &_in_ind_buf; + _p_out_ind_buf = &_out_ind_buf; + + _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n); + _hist_kernel.configure(&_hist_buf, bits, _n); + _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits); + _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _reorder_kernel.configure(&_hist_buf, bits, _n); + _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n); + _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n); + _store_kernel.configure(values, indices, k, _n); + } + else +#endif // Disable GPU implementation + { + // DO NOTHING for CPU. + } +} + +void CLTopKV2::run() +{ + std::string topk_env; +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + run_on_gpu_single_quicksort(); + } + else if (topk_env == "GPU") + { + run_on_gpu(); + } + else +#endif + { + run_on_cpu(); + } +} + +#if 0 +void CLTopKV2::run_on_gpu_single_quicksort() +{ + // This is a single threaded quick sort implementation. + CLScheduler::get().enqueue(_qs_kernel, false); + + arm_compute::CLScheduler::get().sync(); +} + +void CLTopKV2::run_on_gpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + // 1. CLTopKV2Init set key buffer and index buffer. + // - Key buffer is set as the same value of the layer's input + // - Values in the index buffer are set as their indices. + CLScheduler::get().enqueue(_init_kernel, false); + + int n_passes = _total_bits / _bits; + + // 2. Repeat (total_bits/bits) times. + // - total_bits is the number of bits of the data type (e.g., 32 for float) + // - bits defines number of buckets (e.g. 16 buckets where bit is 4) + for (int pass = 0; pass < n_passes; ++pass) + { + arm_compute::CLScheduler::get().sync(); + + // 2.1. 
Calculate histogram with _GROUPS * _ITEMS threads + _hist_kernel.setPass(pass, _p_in_key_buf); + CLScheduler::get().enqueue(_hist_kernel, false); + + // 2.2. Calculate prefix sum locally with multiple threads + CLScheduler::get().enqueue(_scan_hist_kernel, false); + // 2.3. Calculate prefix sum within a work group + CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); + // 2.4. Calculate global prefix sum + CLScheduler::get().enqueue(_paste_hist_kernel, false); + + // 2.5. Reorder keys and indices based on the global prefix sum + _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_kernel, false); + + cl::Buffer *tmp; + // swap key buffers + tmp = _p_in_key_buf; + _p_in_key_buf = _p_out_key_buf; + _p_out_key_buf = tmp; + + // swap index buffers + tmp = _p_in_ind_buf; + _p_in_ind_buf = _p_out_ind_buf; + _p_out_ind_buf = tmp; + } + + // 3. Get the first negative index + // Because we swap in_buf and out_buf at the end of the above for loop, + // the output buffers are in bufs. + _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); + CLScheduler::get().enqueue(_find_first_negative_kernel, false); + + // 4. Correct odering of negatives + // - Since radix sort does not consider negatives, negatives are considered as bigger values + // than positives. + // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf + _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, + _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_negatives_kernel, false); + + // 5. Extract top k values from sorted keys and indices. + _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_store_kernel, false); + + arm_compute::CLScheduler::get().sync(); + +#if 0 + // below code is left for debugging. 
+ int first_neg; + q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); + std::cout << "first neg = " << first_neg << std::endl; + + float in_key[_n]; + q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; + } + + float out_key[_n]; + q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; + } + + int in_ind[_n]; + q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; + } + + int out_ind[_n]; + q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; + } + + int hist_buf[_hist_buf_size]; + q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); + for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { + std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; + } + + int glob_sum_buf[_glob_sum_buf_size]; + q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); + for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { + std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; + } + +#endif +} +#endif // Disable GPU implementation + +void CLTopKV2::run_on_cpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + // const Window& w = _topkv2_kernel.window(); + + _input->map(q); + _values->map(q); + _indices->map(q); + + // int row_size = (w[0].end() - w[0].start()) / w[0].step(); + int row_size = _input->info()->tensor_shape()[0]; + int rank = _input->info()->num_dimensions(); + + if (rank > 2) + throw std::runtime_error("Not supported 
type."); + + int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1); + + if (_input->info()->data_type() == DataType::F32) + { + nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k, + (int32 *)_indices->buffer(), (float *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::S32) + { + nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (int32_t *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (uint8_t *)_values->buffer()); + } + else + { + throw std::runtime_error("Not supported type."); + } + + _input->unmap(q); + _values->unmap(q); + _indices->unmap(q); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp new file mode 100644 index 000000000..40e21671d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CPP/CPPScheduler.h" + +#include <memory> +#include <tuple> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _flip_weights(), + _scaled_output(), + _original_weights(nullptr), + _weights_flipped(), + _is_prepared(false) +{ +} + +Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + + const unsigned int kernel_x = weights->dimension(idx_w); + const unsigned int kernel_y = weights->dimension(idx_h); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1, + "invalid_right must be smaller than kernel_x"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1, + "inner_border_top must be smaller than kernel_y"); + + // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added. + auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], + "Output's depth is invalid."); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + 
unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, weights_info)); + + return Status{}; +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const DataLayout data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + _original_weights = weights; + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were + // added. 
+ auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + // Output auto initialization if not yet initialized + auto_init_if_empty( + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _is_prepared = weights_info.retain_internal_weights(); + + _memory_group.manage(&_scaled_output); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order + // to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // configure scale function + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, 
weights_info); + _scaled_output.allocator()->allocate(); +} + +void CLTransposeConvLayer::run() +{ + prepare(); + + _memory_group.acquire(); + + _scale_f.run(); + _conv_f.run(); + + _memory_group.release(); +} + +void CLTransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _weights_flipped.map(true); + _original_weights->map(CLScheduler::get().queue(), true); + CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); + _weights_flipped.unmap(); + _original_weights->unmap(CLScheduler::get().queue()); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp new file mode 100644 index 000000000..0ce3e6700 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <cmath> +#include <memory> +#include <tuple> + +using namespace arm_compute; + +CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT + : _upsample(), + _output(nullptr) +{ +} + +Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _output = output; + _upsample.configure(input, _output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::run() +{ + _output->map(CLScheduler::get().queue(), true); + if (is_data_type_quantized_asymmetric(_output->info()->data_type())) + { + const uint8_t quantized_zero = _output->info()->quantization_info().offset; + std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); + } + else + { + memset(_output->buffer(), 0, _output->info()->total_size()); + } + _output->unmap(CLScheduler::get().queue()); + + CLScheduler::get().enqueue(_upsample, false); +} diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp new file mode 100644 index 000000000..f8e0ef8a6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" + +#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) +{ + auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>(); + k->configure(input, output, info); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp new file mode 100644 index 000000000..80fbf359d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/NEON/NEFunctionsEx.h" + +// NOTE This empty file aims to validate "NEFunctionsEx.h". +// DO NOT REMOVE this file. diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp new file mode 100644 index 000000000..5ba465b61 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ + +template <ReductionOperation OP> +NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape() +{ +} + +template <ReductionOperation OP> +Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + + TensorShape out_shape = input->tensor_shape(); + const int input_dims = input->num_dimensions(); + int axis_local = axis; + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1); + out_shape.remove_dimension(axis_local); + + const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +template <ReductionOperation OP> +void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor 
*output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + int axis_local = axis; + const int input_dims = input->info()->num_dimensions(); + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + // Perform reduction for axis + TensorShape intermediate_shape = input->info()->tensor_shape(); + intermediate_shape.set(axis_local, 1); + auto in = input; + + _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(), + output->info()->data_type(), + output->info()->quantization_info())); + _memory_group.manage(&_reduced_out); + _reduction_kernel.configure(in, axis_local, &_reduced_out, OP); + + // Allocate intermediate tensor + _reduced_out.allocator()->allocate(); + + // Configure reshape layer if we want to drop the dimensions + TensorShape out_shape = input->info()->tensor_shape(); + out_shape.remove_dimension(axis_local); + auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_out, output); +} + +template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + _reduction_kernel.run(); + _reshape.run(); +} + +// Supported Specializations +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>; +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp new file mode 100644 index 000000000..7c15fc453 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" +#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> + +#include "arm_compute/core/ITensor.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ + +template <BinaryLogicalOperation COP> +void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, + ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); +} + +template <BinaryLogicalOperation COP> +Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output); +} + +void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); +} + +Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op) +{ + return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output); +} + +// Supported Specializations +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp new file mode 100644 index 000000000..f2490e4e8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NECast.h" + +#include "arm_compute/core/NEON/kernels/NECastKernel.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<NECastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} + +Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + return NECastKernel::validate(input, output, input_subtype); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp new file mode 100644 index 000000000..db419e3a8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +{ + auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>(); + k->configure(input, output, block_shape); + _kernel = std::move(k); +} + +Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp new file mode 100644 index 000000000..a95018a28 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h" + +#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ +void NENegLayer::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>(); + k->configure(ElementWiseUnaryEx::NEG, input, output); + _kernel = std::move(k); +} +Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp new file mode 100644 index 000000000..00c3ed94f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp new file mode 100644 index 000000000..d604fedbf --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); + + return Status{}; +} +} // namespace + +void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, + const ITensorInfo *output) +{ + return NETransposeKernel::validate(input, output); +} + +NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) +{ +} + +void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights, + 
ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); +} + +void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + _accumulate_biases = false; + _original_weights = weights; + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + bool _is_fc_after_conv; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1; + } + ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv, + "NEFullyConnectedHybridLayer 
does not support after conv"); + (void)_is_fc_after_conv; + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_output.allocator()->init( + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); + _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Quantize input + _quantized_input.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + _scale_factor.allocator()->init( + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); + + // GEMM + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output); + + // Multiply scale + _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output, + weights->info()->quantization_info().scale); + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; + + _quantized_input.allocator()->allocate(); + _scale_factor.allocator()->allocate(); + _gemmlowp_output.allocator()->allocate(); +} + +Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() 
> 2); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *weights_to_use = weights; + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + + // Validate quantization kernel + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + // Validate matrix multiply kernel + 
ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( + &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale)); + + return Status{}; +} + +void NEFullyConnectedHybridLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Quantize input + NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY); + + // Run matrix multiply + _mm_gemmlowp.run(); + + // Multiply scale factor + NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY); + + // Accumulate biases if provided + if (_accumulate_biases) + { + NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); + } +} + +void NEFullyConnectedHybridLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + auto release_unused = [](Tensor *w) { + if (!w->is_used()) + { + w->allocator()->free(); + } + }; + + // Reshape of the weights (happens only once) + if (!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + + _are_weights_reshaped = true; + // We can not release _original_weights because it can be used in other nodes + } + + // Prepare GEMM prepare and release unused weights + _mm_gemmlowp.prepare(); + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp new file mode 100644 index 000000000..a944f699a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + if (is_data_type_quantized_asymmetric(input.data_type())) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info(input.quantization_info().scale, + -input.quantization_info().offset); + const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, + -weights.quantization_info().offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( + &input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + } + + return Status{}; +} +} // namespace + +NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), + _converted_weights_output(), _reshape_weights_output(), 
_original_weights(nullptr), + _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), + _accumulate_biases(false), _is_quantized(false), _is_prepared(false) +{ +} + +void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info( + QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset)); + weights->info()->set_quantization_info( + QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); + + // Revert back QuantizatioInfo as input and weights could be used in other fully connected + // layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + } +} + +void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init( + 
input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + shape_flatten)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_kernel.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, output); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, output); +} + +void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _original_weights = weights; + + // Configure gemmlowp output + if (_is_quantized) + { + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::S32)); + } + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !_is_quantized) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights.configure(weights_to_use, &_converted_weights_output, + input->info()->tensor_shape(), fc_info.weights_trained_layout); 

    weights_to_use = &_converted_weights_output;
    _are_weights_converted = false;
  }

  ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
  if (_is_fc_after_conv)
  {
    // Fully Connected layer after a Convolution Layer without batches
    configure_conv_fc(input, weights_to_use, tmp_output);
  }
  else
  {
    // Fully Connected layer after a Fully Connected Layer without batches
    configure_fc_fc(input, weights_to_use, tmp_output);
  }

  // Configure output stage for asymmetric quantized types
  if (_is_quantized)
  {
    float multiplier = input->info()->quantization_info().scale *
                       weights->info()->quantization_info().scale /
                       output->info()->quantization_info().scale;
    int output_multiplier;
    int output_shift;
    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
                                                               &output_shift);
    _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
                                     output_shift, output->info()->quantization_info().offset);
    _gemmlowp_output.allocator()->allocate();
  }

  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
}

/**
 * Static check of a fully connected layer configuration, working on tensor infos only.
 *
 * The checks are performed in this order: argument/data-type checks, optional bias
 * accumulation kernel, optional weights reshape, optional weights layout conversion,
 * input flattening (when the layer follows a convolution), the matrix multiply and,
 * for asymmetric quantized inputs, the quantize-down output stage.
 *
 * @param[in] input   Input tensor info. Checked to be non-null and of type QASYMM8, F16 or F32.
 * @param[in] weights Weights tensor info. Checked to be non-null, of the same data type as
 *                    @p input and to have at most 2 dimensions.
 * @param[in] biases  Bias tensor info. May be nullptr; when present and the input is not
 *                    asymmetric quantized its data type is checked against @p input.
 * @param[in] output  Output tensor info. Checked to be non-null and of the same data type as
 *                    @p input.
 * @param[in] fc_info Layer meta-data. transpose_weights, are_weights_reshaped and
 *                    weights_trained_layout are read; retain_internal_weights is ignored.
 *
 * @return Status{} when every check passes; otherwise the status produced by the first failing
 *         ARM_COMPUTE_RETURN_* check (presumably an early return - see the macro definitions).
 */
Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
                                         const ITensorInfo *biases, const ITensorInfo *output,
                                         FullyConnectedLayerInfo fc_info)
{
  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
                                                       DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);

  // Weights only need reshaping when transposition was requested and they are not reshaped yet
  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
  bool is_fc_after_conv = true;
  bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());

  // Infos describing the intermediate tensors the configured function would use.
  // Each const reference binds to a temporary TensorInfo and thereby extends its lifetime to
  // the end of this function.
  const ITensorInfo &flatten_input =
      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
          compute_flatten_shape(input)));
  const ITensorInfo &reshaped_weights =
      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
          compute_transposed_shape(*weights)));
  const ITensorInfo &converted_weights =
      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
                       : TensorInfo(*reshaped_weights.clone());
  const ITensorInfo &gemmlowp_output = TensorInfo(
      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));

  // Configure accumulate biases kernel for non quantized asymmetric types
  if (biases != nullptr && !is_quantized)
  {
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
  }

  // With the Fully Connected layer we can have 4 different cases:
  //  1) Convolution layer -> Fully Connected layer without batches
  //  2) Fully Connected layer -> Fully Connected layer without batches
  //  3) Convolution layer -> Fully Connected layer with batches
  //  4) Fully Connected layer -> Fully Connected layer with batches

  const ITensorInfo *input_to_use = input;
  const ITensorInfo *weights_to_use = weights;
  // Quantized inputs multiply into an intermediate S32 tensor that the output stage consumes
  const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;

  // Check if we have a fully connected layer with batches
  const bool is_batched_fc_layer = output->dimension(1) > 1;

  if (is_batched_fc_layer)
  {
    // Batched case: treated as "after convolution" when the input dimensions from index 3
    // onwards equal the output dimensions from index 1 onwards
    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
                                   output->tensor_shape().cbegin() + 1));
  }
  else
  {
    is_fc_after_conv = input->num_dimensions() > 1;
  }

  if (!weights_reshaped)
  {
    // Validate reshape weights kernel
    ARM_COMPUTE_RETURN_ON_ERROR(
        NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
    weights_to_use = &reshaped_weights;
  }

  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
  {
    // Validate convert weights kernel
    ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
    weights_to_use = &converted_weights;
  }

  if (is_fc_after_conv)
  {
    // Fully Connected layer after a Convolution Layer without batches
    // The weights' second dimension must equal the product of the first three input dimensions
    ARM_COMPUTE_RETURN_ERROR_ON(
        (weights_to_use->dimension(1) !=
         (input->dimension(0) * input->dimension(1) * input->dimension(2))));

    // Validate flatten kernel
    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
    input_to_use = &flatten_input;
  }
  else
  {
    // Fully Connected layer after a Fully Connected Layer without batches
    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
  }
  // Validate matrix multiply kernel
  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));

  // Validate output stage for asymmetric quantized types
  if (is_quantized)
  {
    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
        &gemmlowp_output, biases, output));
  }

  return Status{};
}

/**
 * Executes the layer.
 *
 * On the first call the outputs of the weights reshape/conversion are allocated (when those
 * steps are enabled) and _is_prepared is set. Unlike a one-shot prepare step, the weights
 * reshape, the weights conversion and the GEMM prepare are issued on every call; prepare() in
 * this class is currently compiled out.
 */
void NEFullyConnectedLayerEx::run()
{
  // One-time allocation of the intermediate weights tensors
  if (!_is_prepared)
  {
    if (!_are_weights_reshaped)
      _reshape_weights_output.allocator()->allocate();
    if (!_are_weights_converted)
      _converted_weights_output.allocator()->allocate();
    _is_prepared = true;
  }

  // Weights processing, executed on every run
  {
    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());

    // Reshape of the weights
    if (!_are_weights_reshaped)
    {
      _reshape_weights_function.run();
    }

    // Convert weights if needed
    if (!_are_weights_converted)
    {
      _convert_weights.run();
    }

    // Prepare GEMM prepare
    if (!_is_quantized)
    {
      _mm_gemm.prepare();
    }
  }

  // Scope object tied to _memory_group for the remainder of this function
  MemoryGroupResourceScope scope_mg(_memory_group);

  // Linearize input if it comes from a convolutional layer
  if (_is_fc_after_conv)
  {
    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
  }

  // Run matrix multiply
  if (_is_quantized)
  {
    _mm_gemmlowp.run();
  }
  else
  {
    _mm_gemm.run();
  }

  // Accumulate biases if provided
  if (_is_quantized)
  {
    // The output stage is configured with the biases, so no separate accumulation is scheduled
    _gemmlowp_output_stage.run();
  }
  else
  {
    if (_accumulate_biases)
    {
      NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
    }
  }
}

/**
 * Currently a no-op: the whole body is disabled by the "#if 0" guard below, so the weights
 * reshape/conversion and the GEMM prepare are performed by run() instead.
 *
 * NOTE(review): the disabled code calls release_unused(&_reshape_weights_output) twice; worth
 * cleaning up if the block is ever re-enabled rather than removed.
 */
void NEFullyConnectedLayerEx::prepare()
{
#if 0 // TODO Remove this block
  if (!_is_prepared)
  {
    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());

    auto release_unused = [](Tensor *w) {
      if (!w->is_used())
      {
        w->allocator()->free();
      }
    };

    // Pointer to current weights
    const ITensor *cur_weights = _original_weights;

    // Reshape of the weights (happens only once)
    if (!_are_weights_reshaped)
    {
      // Run reshape weights kernel and mark weights as unused
      _reshape_weights_output.allocator()->allocate();
      _reshape_weights_function.run();

      cur_weights->mark_as_unused();
      cur_weights = &_reshape_weights_output;
      _are_weights_reshaped = true;
    }

    // Convert weights if needed (happens only once)
    if (!_are_weights_converted)
    {
      _converted_weights_output.allocator()->allocate();
      _convert_weights.run();

      cur_weights->mark_as_unused();
      _are_weights_converted = true;
    }

    // Release reshaped weights if unused
    release_unused(&_reshape_weights_output);

    // Prepare GEMM prepare and release unused weights
    if (!_is_quantized)
    {
      _mm_gemm.prepare();
    }

    // Release converted weights if unused
    release_unused(&_reshape_weights_output);
    release_unused(&_converted_weights_output);

    _is_prepared = true;
  }
#endif
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..fcac3c7ae --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -0,0 +1,91 @@
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h" + +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h> + +using namespace arm_compute; + +void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input, + const arm_compute::ITensor *weights, + const arm_compute::ITensor *biases, + arm_compute::ITensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape, + KernelType kernel_type) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + const ITensor *input_to_use = input; + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); + _neon_reshape.configure(_input, &_neon_buffer); + input_to_use = &_neon_buffer; + } + + _neon_fc = [&]() { + if (kernel_type == KernelType::GENERAL) + { + auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else + { + assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); + + bool is_hybrid = input->info()->data_type() == DataType::F32 && + weights->info()->data_type() == DataType::S8; + + if (is_hybrid) + { + auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else + { + auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + } + }(); + + // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. 
+ if (_needs_reshape) + { + _neon_buffer.allocator()->allocate(); + } +} + +void NEFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _neon_reshape.run(); + + _neon_fc->run(); +} + +void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp new file mode 100644 index 000000000..11794a1ea --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

/**
 * Creates an unconfigured function.
 *
 * The memory manager is handed to both the internal memory group and the assembly dispatch
 * helper. Kernel pointers start as nullptr, offsets as 0 and every flag as false.
 *
 * @param[in] memory_manager Memory manager shared with the internal memory group.
 */
NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
    std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
      _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
      _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
      _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
      _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
      _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
{
}

/**
 * Configures the low-precision matrix multiplication output = a * b, including the offset
 * contribution and, when requested through @p gemm_info, a fused output stage.
 *
 * The arguments are first checked with validate(). On builds where __aarch64__ is defined this
 * function unconditionally raises ARM_COMPUTE_ERROR("aarch64 not supported"); the assembly
 * dispatch code is disabled with "#if 0".
 *
 * @param[in]  a         Left-hand side matrix. Must not be nullptr.
 * @param[in]  b         Right-hand side matrix. Must not be nullptr.
 * @param[in]  c         Optional tensor forwarded to the fused output-stage kernel. May be nullptr.
 * @param[out] output    Destination tensor. Must not be nullptr.
 * @param[in]  gemm_info GEMM meta-data; reshape_b_only_on_first_run() and
 *                       gemmlowp_output_stage() are read here.
 */
void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
                                               ITensor *output, const GEMMInfo &gemm_info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
  // NOTE(review): c is marked unused here although it is read below (validate call and
  // output-stage kernel); the macro looks redundant - confirm before removing.
  ARM_COMPUTE_UNUSED(c);
  ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
      a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

  const ITensor *matrix_a = a;
  const ITensor *matrix_b = b;
  GEMMInfo info = gemm_info;

  // Clear state
  _mtx_a_reshape_kernel = nullptr;
  _mtx_b_reshape_kernel = nullptr;

  // Set internal variables
  _a_offset = a->info()->quantization_info().offset;
  _b_offset = b->info()->quantization_info().offset;
  // A with fewer than two rows is handled as a vector-by-matrix product (no reshapes)
  _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
  _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
  _is_prepared = false;
  _fused_assembly_path = false;
  _original_b = b;

  const ITensor *a_to_use = a;

  // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
  // NOTE(review): _fuse_output_stage is only ever set to true in this function; it is not reset
  // when the output stage type is NONE, which matters if the object is configured twice.
  if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
  {
    _fuse_output_stage = true;
    _memory_group.manage(&_mm_result_s32);
    TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
    _mm_result_s32.allocator()->init(info_mm_result_s32);
  }

#ifdef __aarch64__
#if 0 // Can use after arm compute library v19.11
  switch (a->info()->data_type())
  {
    case DataType::QASYMM8:
    case DataType::QASYMM8_SIGNED:
    case DataType::U8:
    case DataType::S8:
    {
      if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
          info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
      {
        _asm_glue.configure(a_to_use, b, c, output, gemm_info);
        _fused_assembly_path = _asm_glue.is_configured();
      }
      else
      {
        _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
                            gemm_info);
      }
      _assembly_path = _asm_glue.is_configured();
      break;
    }
    default:
    {
      ARM_COMPUTE_ERROR("Datatype not supported");
      break;
    }
  }
#endif // 0
  ARM_COMPUTE_ERROR("aarch64 not supported");
#endif /* __aarch64__ */
  // With the block above disabled, _assembly_path is not assigned anywhere in this function
  if (!(_assembly_path || _run_vector_matrix_multiplication))
  {
    matrix_a = &_tmp_a;
    matrix_b = &_tmp_b;

    // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
    // 4.0f) ]
    TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
                      a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
    // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
    // 16.0f) ]
    TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
                      b->info()->quantization_info());
    _tmp_a.allocator()->init(a_info);
    _tmp_b.allocator()->init(b_info);
    _memory_group.manage(&_tmp_a);
    // When B is reshaped only once, _tmp_b is allocated in prepare() instead of being managed
    if (!_reshape_b_only_on_first_run)
    {
      _memory_group.manage(&_tmp_b);
    }

    // Configure interleave kernel
    {
      auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
      k->configure(a_to_use, &_tmp_a);
      _mtx_a_reshape_kernel = std::move(k);
    }

    // Configure transpose kernel
    {
      auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
      k->configure(b, &_tmp_b);
      _mtx_b_reshape_kernel = std::move(k);
    }
  }

  if (!_fused_assembly_path)
  {
    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if (_a_offset != 0)
    {
      TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

      _vector_sum_col.allocator()->init(info_vector_sum_col);
      // Same pattern as _tmp_b: allocated in prepare() when B is processed only once
      if (!_reshape_b_only_on_first_run)
      {
        _memory_group.manage(&_vector_sum_col);
      }

      // Configure Matrix B reduction kernel
      _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    if (_b_offset != 0)
    {
      TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);

      _vector_sum_row.allocator()->init(info_vector_sum_row);
      _memory_group.manage(&_vector_sum_row);

      // Configure matrix A reduction kernel
      _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
                                        false);
    }

    if (_fuse_output_stage)
    {
      // Configure matrix multiply kernel
      if (!_assembly_path)
      {
        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
        k->configure(matrix_a, matrix_b, &_mm_result_s32);
        _mm_kernel = std::move(k);
      }

      // _flip_signedness is initialised to false by the constructor and not modified in this
      // function, so `output` is what gets passed here.
      _offset_contribution_output_stage_kernel.configure(
          &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
          _b_offset == 0 ? nullptr : &_vector_sum_row, c,
          _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
          _b_offset, info.gemmlowp_output_stage());
    }
    else
    {
      // Configure matrix multiply kernel
      if (!_assembly_path)
      {
        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
        k->configure(matrix_a, matrix_b, output);
        _mm_kernel = std::move(k);
      }
      // Configure offset contribution kernel
      _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                            _b_offset == 0 ? nullptr : &_vector_sum_row,
                                            a_to_use->info()->dimension(0), _a_offset, _b_offset);
    }
  }

  // Allocate tensors
  if (!_assembly_path && !_run_vector_matrix_multiplication)
  {
    _tmp_a.allocator()->allocate();
    if (!_reshape_b_only_on_first_run)
    {
      _tmp_b.allocator()->allocate();
    }
  }

  if (!_fused_assembly_path)
  {
    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
      _vector_sum_col.allocator()->allocate();
    }

    if (_b_offset != 0)
    {
      _vector_sum_row.allocator()->allocate();
    }
  }

  if (_fuse_output_stage)
  {
    _mm_result_s32.allocator()->allocate();
  }
}

/**
 * Static check of a configuration, working on tensor infos only.
 *
 * @param[in] a         Info of matrix A. Checked to be single-channel S8.
 * @param[in] b         Info of matrix B. Checked to be single-channel S8; its dimension(1) must
 *                      equal dimension(0) of @p a.
 * @param[in] c         Optional info forwarded to the output-stage validation. Must be nullptr
 *                      when the output stage type is NONE.
 * @param[in] output    Info of the destination. Checked to be single-channel S32.
 * @param[in] gemm_info GEMM meta-data. is_a_reshaped() / is_b_reshaped() must be false.
 *
 * @return Status{} when every check passes; otherwise the status produced by the first failing
 *         ARM_COMPUTE_RETURN_* check (presumably an early return - see the macro definitions).
 *
 * NOTE(review): this function consults NEGEMMAssemblyDispatch::validate, whereas the assembly
 * configuration in configure() is disabled with "#if 0"; the two code paths may therefore
 * disagree about which kernels are used - confirm this is intended.
 */
Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
                                                const ITensorInfo *c, const ITensorInfo *output,
                                                const GEMMInfo &gemm_info)
{
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
      c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
      "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
  ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                  "The product AB is defined only if the number of columns in A is "
                                  "equal to the number of rows in B");
  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
                                  "Matrix A already reshaped is not supported");
  ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
                                  "Matrix B already reshaped is not supported");

  GEMMInfo info = gemm_info;
  const ITensorInfo *matrix_a_info = a;
  const ITensorInfo *matrix_b_info = b;

  const ITensorInfo *a_to_use = a;

  TensorInfo tmp_a_info{};
  TensorInfo tmp_b_info{};
  TensorInfo mm_result_s32_info{};

  int32_t a_offset = a->quantization_info().offset;
  int32_t b_offset = b->quantization_info().offset;

  bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
  if (fuse_output_stage)
  {
    auto_init_if_empty(
        mm_result_s32_info,
        a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
  }

  // Check if we need to run the optimized assembly kernel
  bool run_optimised = false;
  bool run_optimised_requantized = false;
  const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
  // NOTE(review): a was restricted to S8 at the top of this function, so the QASYMM8 branch
  // looks unreachable - confirm.
  if (a_to_use->data_type() == DataType::QASYMM8 &&
      info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
  {
    run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f,
                                                          reshape_b_only_on_first_run));
    run_optimised_requantized = run_optimised;
  }
  else
  {
    run_optimised = bool(NEGEMMAssemblyDispatch::validate(
        a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f,
        reshape_b_only_on_first_run));
  }

  if (run_optimised)
  {
    // Output shape checks for the assembly path, including the 3D reinterpretation cases
    ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
    if (info.depth_output_gemm3d() != 0)
    {
      if (info.reinterpret_input_as_3d())
      {
        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
      }
      else
      {
        ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
      }
    }
    else
    {
      ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
    }
  }
  else
  {
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
                                    "NEGEMM cannot reinterpret the input tensor as 3D");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
                                    "NEGEMM cannot reinterpret the output tensor as 3D");

    const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
    if (!run_vector_matrix_multiplication)
    {
      matrix_a_info = &tmp_a_info;
      matrix_b_info = &tmp_b_info;

      // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
      // 4.0f) ]
      // NOTE(review): std::ceil is used below without a visible <cmath> include; presumably it
      // arrives through another header - confirm.
      TensorShape shape_tmp_a = a->tensor_shape();
      shape_tmp_a.set(0, a->dimension(0) * 4);
      shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

      // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
      // / 16.0f) ]
      TensorShape shape_tmp_b = b->tensor_shape();
      shape_tmp_b.set(0, b->dimension(1) * 16);
      shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

      // Validate interleave kernel
      auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
      auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
    }
  }

  if (!run_optimised_requantized)
  {
    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
    if (a_offset != 0)
    {
      info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

      // Configure Matrix B reduction kernel
      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
          b, &info_vector_sum_col, a->dimension(0), false));
    }

    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
    if (b_offset != 0)
    {
      info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

      // Configure matrix A reduction kernel
      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
          a_to_use, &info_vector_sum_row, a->dimension(0), false));
    }

    if (fuse_output_stage)
    {
      if (!run_optimised)
      {
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
            matrix_a_info, matrix_b_info, &mm_result_s32_info));
      }

      // Validate offset contribution kernel
      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
          &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
          b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
          info.gemmlowp_output_stage()));
    }
    else
    {
      if (!run_optimised)
      {
        ARM_COMPUTE_RETURN_ON_ERROR(
            NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
      }
      // Validate offset contribution kernel
      ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
          output, a_offset == 0 ? nullptr : &info_vector_sum_col,
          b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
    }
  }
  return Status{};
}

/**
 * Executes the configured multiplication.
 *
 * Calls prepare() first, then schedules, in order: the A reshape, the B reshape (only when B is
 * not restricted to the first run), the matrix multiply (assembly helper if configured,
 * otherwise _mm_kernel), the A/B reductions required by non-zero offsets and finally the offset
 * contribution kernel (fused with the output stage when enabled).
 */
void NEGEMMLowpMatrixMultiplyCoreEx::run()
{
  prepare();

  // Scope object tied to _memory_group for the remainder of this function
  MemoryGroupResourceScope scope_mg(_memory_group);

  // Reshape inputs
  if (_mtx_a_reshape_kernel)
  {
    NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
  }
  if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
  {
    NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
  }

  // Run GEMM
  if (_asm_glue.is_configured())
  {
    _asm_glue.run();
  }
  else
  {
    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
  }

  if (!_fused_assembly_path)
  {
    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if (_b_offset != 0)
    {
      NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    // (when B is processed only once this reduction has already been run by prepare())
    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
      NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
    }

    if (_fuse_output_stage)
    {
      // Run offset contribution kernel
      NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
    }
    else
    {
      // Run offset contribution kernel
      NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
    }
  }
}

/**
 * Performs the work that is needed only once after configure().
 *
 * When B is reshaped only on the first run, this reshapes B (through the assembly helper or the
 * transpose kernel), marks the original B tensor as unused and, if _a_offset is non-zero, runs
 * the matrix B reduction. Subsequent calls return immediately because _is_prepared is set.
 */
void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
{
  if (!_is_prepared)
  {
    // Run assembly reshape
    if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
    {
      ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

      _asm_glue.prepare();
      _original_b->mark_as_unused();
    }
    // Run non-assembly reshape
    else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
    {
      ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

      // Run reshape kernel and mark original weights tensor as unused
      _tmp_b.allocator()->allocate();
      NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
      _original_b->mark_as_unused();
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if (_a_offset != 0 && _reshape_b_only_on_first_run)
    {
      _vector_sum_col.allocator()->allocate();
      NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
    }

    _is_prepared = true;
  }
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp new file mode 100644 index 000000000..90dabb35a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp @@ -0,0 +1,47 @@
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGatherEx.h" + +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ +void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return NEGatherKernelEx::validate(input, indices, output, axis); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp new file mode 100644 index 000000000..624185d2c --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" + +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, + ITensor *output, ITensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} + +Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..1c2c8f027 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() +{ +} + +void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma, + ITensor *beta, float epsilon) +{ + const DataLayout data_layout = input->info()->data_layout(); + + // Configure Kernels + _is_nchw = data_layout == DataLayout::NCHW; + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + + _permute_output.configure(&_permuted_output, output != nullptr ? 
output : input, + PermutationVector(2U, 0U, 1U)); + _permuted_input.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + _normalization_kernel.configure(input, output, gamma, beta, epsilon); + } +} + +Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return NEInstanceNormalizationLayerKernelEx::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); +} + +void NEInstanceNormalizationLayerEx::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp new file mode 100644 index 000000000..1150cef76 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEPReLU.h" + +#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +using namespace arm_compute; + +void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp new file mode 100644 index 000000000..84411c266 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), + _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), + _gemm_output(), _add_output(), _is_prepared(false) +{ +} + +Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, + output); + + const int idx_width = 0; + const int idx_height = 1; + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != + recurrent_weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != + recurrent_weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); + 
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + hidden_state->tensor_shape()); + + auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape( + recurrent_weights, hidden_state->dimension(idx_height)), + 1, input->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate( + &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); + + return Status{}; +} + +void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *recurrent_weights, const ITensor *bias, + ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(), + recurrent_weights->info(), bias->info(), + hidden_state->info(), output->info(), info)); + + const int idx_height = 1; + TensorShape shape = misc::shape_calculator::compute_rnn_shape( + recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + + _is_prepared = false; + + // Manage intermediate buffers and configure + _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + + // Manage intermediate buffers and configure + _memory_group.manage(&_fully_connected_out); + _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); + 
+ _memory_group.manage(&_gemm_output); + _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); + + _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _memory_group.manage(&_add_output); + + _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, + ConvertPolicy::SATURATE); + + _fully_connected_out.allocator()->allocate(); + _gemm_output.allocator()->allocate(); + + _activation_kernel.configure(&_add_output, hidden_state, info); + _add_output.allocator()->allocate(); + + _copy_kernel.configure(hidden_state, output); +} + +void NERNNLayerEx::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + _fully_connected_kernel.run(); + + _gemm_state_f.run(); + + NEScheduler::get().schedule(&_add_kernel, Window::DimY); + NEScheduler::get().schedule(&_activation_kernel, Window::DimY); + + // copy hidden out to output + NEScheduler::get().schedule(&_copy_kernel, Window::DimY); +} + +void NERNNLayerEx::prepare() +{ + if (!_is_prepared) + { + _fully_connected_kernel.prepare(); + _gemm_state_f.prepare(); + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp new file mode 100644 index 000000000..c65e93570 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); 
+ _reduction_kernels = + arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops); + _reduced_outs = + arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = i == 0 ? input->info()->tensor_shape() + : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(output->info()->data_layout())); + _memory_group.manage(_reduced_outs.get() + i); + _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], + ReductionOperation::MEAN_SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); + } +} + +void NEReduceMeanEx::run() +{ + _memory_group.acquire(); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } + _memory_group.release(); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp new file mode 100644 index 000000000..b36f8287a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + 
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? 
input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], op); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceOperation::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp new file mode 100644 index 000000000..3c18217ef --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceSum.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void 
NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(input->info()->data_layout())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], + ReductionOperation::SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceSum::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp new file mode 100644 index 000000000..c3431c418 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +namespace +{ +/** Define dimension to split the window + * + * @param[in] axis Reduction axis + * + * @return The dimension to split the window + */ +size_t reduction_window_split_dimension(unsigned int axis) +{ + switch (axis) + { + case 0: + return Window::DimY; + case 1: + case 2: + case 3: + return Window::DimX; + default: + ARM_COMPUTE_ERROR("Unsupported reduction axis"); + } +} +} // namespace + +NEReductionOperationEx::NEReductionOperationEx() + : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis() +{ +} + +Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op)); + + return Status{}; +} + +void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis, + ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + NEReductionOperationEx::validate(input->info(), output->info(), axis, op)); + + // Configure reduction kernel + _reduction_kernel.configure(input, output, axis, op); + _window_split = reduction_window_split_dimension(axis); + _reduction_axis = axis; + + if (axis == 0) + { + // Configure fill border 
kernel + const BorderSize fill_border_size = _reduction_kernel.border_size(); + PixelValue pixelValue; + switch (op) + { + case ReduceOperation::MIN: + { + switch (input->info()->data_type()) + { + case DataType::F32: + { + pixelValue = PixelValue(std::numeric_limits<float>::max()); + break; + } + case DataType::F16: + { + pixelValue = PixelValue(static_cast<half>(65504.0f)); + break; + } + case DataType::QASYMM8: + { + pixelValue = + PixelValue(255, input->info()->data_type(), input->info()->quantization_info()); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported DataType"); + } + } + break; + } + case ReduceOperation::MAX: + { + switch (input->info()->data_type()) + { + case DataType::F32: + { + pixelValue = PixelValue(-std::numeric_limits<float>::max()); + break; + } + case DataType::F16: + { + pixelValue = PixelValue(static_cast<half>(-65504.0f)); + break; + } + case DataType::QASYMM8: + { + pixelValue = + PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported DataType"); + } + } + break; + } + default: + ARM_COMPUTE_ERROR("Reduction Operation unsupported"); + } + _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue); + } +} + +void NEReductionOperationEx::run() +{ + if (_reduction_axis == 0) + { + NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); + } + NEScheduler::get().schedule(&_reduction_kernel, _window_split); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp new file mode 100644 index 000000000..c9f914fb0 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() + : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +{ +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, + const ITensor *paddings, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape, paddings, output); +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, + const int block_shape_y, const Size2D &padding_left, + const Size2D &padding_right, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, + output); +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, + const ITensorInfo *paddings, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); + + return Status{}; +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, + const int block_shape_y, const 
Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( + input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + + return Status{}; +} + +void NESpaceToBatchLayerEx::run() +{ + // Zero out output only if we have paddings + if (_has_padding) + { + NEScheduler::get().schedule(&_memset_kernel, Window::DimY); + } + NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp new file mode 100644 index 000000000..b6ae21cc0 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +{ + auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>(); + k->configure(input, output, block_shape); + _kernel = std::move(k); +} + +Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape)); + return Status{}; +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp new file mode 100644 index 000000000..fd15ef05f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), + _permute_input(), + _permute_weights(), + _permute_output(), + _scaled_output(), + _weights_flipped(), + _permuted_input(), + _permuted_weights(), + _permuted_output(), + _is_nchw(false), + _original_weights(nullptr), + _input(nullptr), + _info(), + _is_prepared(false) +{ +} + +Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + + auto out_dims = transposeconv_output_dimensions( + 
input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if (is_data_type_quantized_asymmetric(input->data_type()) && bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else if (bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), + "Output's dim 0 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), + "Output's dim 1 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), + "Output's dim 2 is invalid."); + } + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + scale_out_info.set_data_layout(input->data_layout()); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != + scale_out_info.dimension(batches_idx)); + 
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != + scale_out_info.dimension(channel_idx)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, WeightsInfo())); + + return Status{}; +} + +void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const DataLayout data_layout = input->info()->data_layout(); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + _is_nchw = data_layout == DataLayout::NCHW; + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _memory_group.manage(&_scaled_output); + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_weights); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + // Configure the function to transform the weights tensor from NHWC -> NCHW + _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); + _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permuted_weights.info()->set_data_layout(DataLayout::NCHW); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), + _permuted_input.info()->quantization_info()); + scale_out_info.set_data_layout(DataLayout::NCHW); + _scaled_output.allocator()->init(scale_out_info); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::CEIL); + _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); + _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); + 
_flip_weights.configure(&_permuted_weights, &_weights_flipped); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const auto out_shape = output->info()->tensor_shape(); + TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; + TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), + output->info()->quantization_info()); + _permuted_output.allocator()->init(permuted_out_info); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); + + // Configure the function to transform the convoluted output to NHWC + _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + + _permuted_input.allocator()->allocate(); + _permuted_weights.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + _scaled_output.allocator()->init(scale_out_info); + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // setup the function to convolve 
the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + } + _scaled_output.allocator()->allocate(); +} + +void NETransposeConvLayer::run() +{ + prepare(); + + // MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + _upsample_f.run(); + _conv_f.run(); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} + +void NETransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + // Permute weights + if (!_is_nchw) + { + _permute_weights.run(); + } + NEScheduler::get().schedule(&_flip_weights, Window::DimZ); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp new file mode 100644 index 000000000..67e1bfb02 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/misc/functions/GenericGather.h" + +namespace arm_compute +{ +namespace misc +{ + +bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() != 4 && output->num_dimensions() == 4 && + input->data_layout() == DataLayout::NCHW); +} + +void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis) +{ + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + arm_compute::PermutationVector pv; + if (shouldPermute(input->info(), output->info())) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H / + // C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + if (shouldPermute(input->info(), output->info())) + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis); + _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. 
+ _cl_permuted.allocator()->allocate(); + } + else + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +void GenericGather::run(void) +{ + if (utils::isGpuMode()) + { + _cl_gather.run(); + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp new file mode 100644 index 000000000..8025ae28e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h" + +namespace arm_compute +{ +namespace misc +{ + +namespace +{ + +bool shouldPermute(const arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() == 4 || output->num_dimensions() == 4) && + (input->num_dimensions() != output->num_dimensions() && + input->data_layout() == DataLayout::NCHW); +} + +} // namespace + +void GenericReshapeLayer::configure(const arm_compute::ITensor *input, arm_compute::ITensor *output) +{ + _input = input; + _output = output; + + arm_compute::PermutationVector pv; + if (input->info()->data_layout() == DataLayout::NCHW && input->info()->num_dimensions() == 4 && + output->info()->num_dimensions() != 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape W / H / C into another tensor of shape + // C / W / H + // + // Original | Permuted + // 0 | W | C (from 2) + // 1 | H | W (from 0) + // 2 | C | H (from 1) + // + pv = arm_compute::PermutationVector{2, 0, 1}; + } + else if (input->info()->data_layout() == DataLayout::NCHW && + input->info()->num_dimensions() != 4 && output->info()->num_dimensions() == 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape + // W / H / C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + const auto const_input = CAST_CL(const_cast<arm_compute::ITensor *>(input)); + if (shouldPermute(input->info(), output->info())) + { + _cl_permute.configure(const_input, &_cl_permuted, pv); + _cl_reshape.configure(&_cl_permuted, CAST_CL(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate 
here. + _cl_permuted.allocator()->allocate(); + } + else + { + _cl_reshape.configure(const_input, CAST_CL(output)); + } + } + else + { + if (shouldPermute(input->info(), output->info())) + { + _neon_permute.configure(input, &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, output); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _neon_permuted.allocator()->allocate(); + } + else + { + _neon_reshape.configure(input, output); + } + } +} + +void GenericReshapeLayer::run(void) +{ + if (utils::isGpuMode()) + { + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + _cl_reshape.run(); + } + else + { + if (shouldPermute(_input->info(), _output->info())) + { + _neon_permute.run(); + } + _neon_reshape.run(); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp new file mode 100644 index 000000000..44a4bb9ed --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// (end of the Apache license header that opens on the preceding line of this page)
//  */

// The project header is included whenever it can be found; the __has_include probe only lets
// this translation unit be compiled on its own as well.
#if defined(__has_include)
#if __has_include("arm_compute/runtime/misc/functions/Utils.h")
#include "arm_compute/runtime/misc/functions/Utils.h"
#endif
#else
#include "arm_compute/runtime/misc/functions/Utils.h"
#endif

#include <cstdlib> // std::getenv

namespace arm_compute
{
namespace misc
{
namespace utils
{

/**
 * @brief Tell whether the GPU (CL) code path is selected
 * @return false only when the environment variable NEON is set and its value starts with
 *         the character '1'; true in every other case (including when NEON is unset)
 */
bool isGpuMode()
{
  const char *neon = std::getenv("NEON");
  const bool neon_requested = (neon != nullptr) && (neon[0] == '1');
  return !neon_requested;
}

} // namespace utils
} // namespace misc
} // namespace arm_compute

// diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h
// new file mode 100644
// index 000000000..f94effea1
// --- /dev/null
// +++ b/compute/ARMComputeEx/src/runtime/topk_v2.h
// @@ -0,0 +1,191 @@
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @file topk_v2.h
 * @brief This file contains TopK method and TopContainer class for TopK operation
 * @ingroup COM_AI_RUNTIME
 */

#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__

// Everything this header uses is included here so that it can be included on its own.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

using int32 = std::int32_t;

namespace nnfw
{
namespace rt
{
namespace optimized_ops
{
/**
 * @brief class to define TopK operation
 * @note The following codes are implemented and modified while referring to TFLite topk_v2.cc
 *       file. TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32
 *       other than TFLite.
 *       (TFLite additionally supports kTfLiteInt64.)
 *
 * The class that collects top indexes of k values. Based on template
 * tensorflow::gtl::TopN<> but, for optimization,
 * it re-uses the same container.
 *
 * Indexes are ordered by the value they refer to (larger first); equal values are ordered by
 * index (smaller first).
 */
template <typename T> class TopContainer
{
public:
  /**
   * @brief Prevent default constructor of this class
   */
  TopContainer() = delete;
  /**
   * @brief Constructor with params
   * @param [in] k        The top k predictions
   * @param [in] row_size Size of row in data
   */
  TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
  {
    // One extra slot: push() briefly holds k + 1 indexes before discarding the worst one
    container_.reserve(std::min(k, row_size) + 1);
  }

  /**
   * @brief Prevent instances of this class from being copied (As this class contains pointers)
   * @param [in] topContainer To copy
   */
  TopContainer(const TopContainer &) = delete;
  /**
   * @brief Prevent instances of this class from being copied (As this class contains pointers)
   * @param [in] topContainer To copy
   * @return Reference of TopContainer
   */
  TopContainer &operator=(const TopContainer &) = delete;

  /**
   * @brief Start collecting
   * @param [in] values Row of values that the indexes pushed afterwards refer to
   * @return N/A
   */
  void start_collecting(const T *values)
  {
    values_ = values;
    container_.clear();
  }

  /**
   * @brief Push a value to be compared for topk
   * @param [in] a Index (into the row given to start_collecting) of the value to compare
   * @return N/A
   */
  void push(int32 a)
  {
    auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    if (container_.size() <= static_cast<std::size_t>(k_))
    {
      container_.push_back(a);
      if (container_.size() == static_cast<std::size_t>(k_ + 1))
      {
        // k + 1 indexes collected: build the heap and park the worst one in the last slot
        std::make_heap(container_.begin(), container_.end(), comparator);
        std::pop_heap(container_.begin(), container_.end(), comparator);
      }
    }
    else if (comparator(a, container_.front()))
    {
      // 'a' beats the worst index currently kept: overwrite the parked slot, restore the heap
      // and park the new worst index in the last slot again
      container_.back() = a;
      std::push_heap(container_.begin(), container_.end(), comparator);
      std::pop_heap(container_.begin(), container_.end(), comparator);
    }
  }

  /**
   * @brief Get sorted result from pushed values
   * @return Reference of vector with the collected indexes, best first. It holds at most k
   *         entries, and fewer when fewer than k indexes were pushed.
   */
  const std::vector<int32> &sorted_result()
  {
    auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    if (container_.size() <= static_cast<std::size_t>(k_))
    {
      // The heap was never built; a plain sort is enough
      std::sort(container_.begin(), container_.end(), comparator);
    }
    else
    {
      // The last slot only holds the parked (discarded) index; sort the heap in front of it
      std::sort_heap(container_.begin(), container_.end() - 1, comparator);
      container_.resize(k_);
    }
    return container_;
  }

private:
  int32 k_;
  std::vector<int32> container_;
  const T *values_ = nullptr;

  /**
   * @brief Ordering used for the heap and the final sort
   * @return true when index a must come before index b: its value is larger, or the values
   *         are not ordered either way and a is the smaller index
   */
  bool compare_fun(int32 a, int32 b) const
  {
    if (values_[b] < values_[a])
    {
      return true;
    }
    if (values_[b] > values_[a])
    {
      return false;
    }
    return a < b;
  }
};

/**
 * @brief Operates TopK operation with params
 * @param [in] row_size Size of row in data
 * @param [in] num_rows The number of rows in data
 * @param [in] data To be operated in
 * @param [in] k The top k predictions
 * @param [out] output_indexes Indexes of targets in the top k predictions
 * @param [out] output_values Values of targets in the top k predictions
 * @return N/A
 * @note Each output row is k entries wide. When k is larger than row_size only the first
 *       row_size entries of every output row are written.
 */
template <typename T>
void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
          T *output_values)
{
  TopContainer<T> topc(k, row_size);
  for (int32 row = 0; row < num_rows; ++row)
  {
    // Offsets are computed in ptrdiff_t so that row * row_size cannot overflow int
    const std::ptrdiff_t row_index = static_cast<std::ptrdiff_t>(row);
    const T *values_row = data + row_index * row_size;

    topc.start_collecting(values_row);
    for (int32 c = 0; c < row_size; ++c)
    {
      topc.push(c);
    }

    // Prepare output buffers.
    int32 *indexes_row = output_indexes + row_index * k;
    T *output_row = output_values + row_index * k;
    // We always assume that the output is sorted.
    const auto &top_k = topc.sorted_result();
    std::copy(top_k.begin(), top_k.end(), indexes_row);
    std::transform(top_k.begin(), top_k.end(), output_row,
                   [values_row](const int32 loc) { return values_row[loc]; });
  }
}

} // namespace optimized_ops
} // namespace rt
} // namespace nnfw

#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__