Diffstat (limited to 'compute/ARMComputeEx/src/runtime/NEON/functions')
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp | 109
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp | 70
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp | 45
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp | 48
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp | 44
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp | 29
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp | 282
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp | 477
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp | 91
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp | 503
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp | 47
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp | 37
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp | 97
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp | 39
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp | 146
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp | 164
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp | 164
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp | 165
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp | 157
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp | 99
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp | 49
-rw-r--r--  compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp | 307
22 files changed, 3169 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
new file mode 100644
index 000000000..5ba465b61
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+
+template <ReductionOperation OP>
+NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape()
+{
+}
+
+template <ReductionOperation OP>
+Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+
+ TensorShape out_shape = input->tensor_shape();
+ const int input_dims = input->num_dimensions();
+ int axis_local = axis;
+
+ // Convert negative axis
+ axis_local = wrap_around(axis_local, input_dims);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1);
+ out_shape.remove_dimension(axis_local);
+
+ const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+template <ReductionOperation OP>
+void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ int axis_local = axis;
+ const int input_dims = input->info()->num_dimensions();
+
+ // Convert negative axis
+ axis_local = wrap_around(axis_local, input_dims);
+
+ // Perform reduction for axis
+ TensorShape intermediate_shape = input->info()->tensor_shape();
+ intermediate_shape.set(axis_local, 1);
+ auto in = input;
+
+ _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(),
+ output->info()->data_type(),
+ output->info()->quantization_info()));
+ _memory_group.manage(&_reduced_out);
+ _reduction_kernel.configure(in, axis_local, &_reduced_out, OP);
+
+ // Allocate intermediate tensor
+ _reduced_out.allocator()->allocate();
+
+ // Configure reshape layer if we want to drop the dimensions
+ TensorShape out_shape = input->info()->tensor_shape();
+ out_shape.remove_dimension(axis_local);
+ auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_out, output);
+}
+
+template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _reduction_kernel.run();
+ _reshape.run();
+}
+
+// Supported Specializations
+template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>;
+template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>;
+} // namespace arm_compute
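Usage sketch (not part of the diff): a minimal way to drive the NEArgMinMaxStatic function added above. The tensor shapes, the U32 index output type, and the explicit nullptr memory manager are illustrative assumptions, not taken from this patch.

    #include "arm_compute/runtime/NEON/functions/NEArgMinMax.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_argmax_example()
    {
      Tensor input, output;
      // 2-D input of 8 values x 4 rows; reducing axis 0 leaves a 4-element index vector
      input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::U32));

      NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX> argmax(nullptr);
      argmax.configure(&input, /*axis=*/0, &output);

      input.allocator()->allocate();
      output.allocator()->allocate();
      // ... fill input ...
      argmax.run();
    }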
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
new file mode 100644
index 000000000..7c15fc453
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
+#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+
+template <BinaryLogicalOperation COP>
+void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
+ ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(COP, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+template <BinaryLogicalOperation COP>
+Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
+{
+ return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output);
+}
+
+void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
+ BinaryLogicalOperation op)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ k->configure(op, input1, input2, output);
+ _kernel = std::move(k);
+}
+
+Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output, BinaryLogicalOperation op)
+{
+ return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output);
+}
+
+// Supported Specializations
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
+} // namespace arm_compute
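Usage sketch (not part of the diff): the run-time-selected variant added above, shown with U8 tensors; the shapes and the U8 data type are assumptions about what the underlying kernel accepts.

    #include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_logical_and_example()
    {
      Tensor a, b, out;
      const TensorInfo info(TensorShape(16U, 16U), 1, DataType::U8);
      a.allocator()->init(info);
      b.allocator()->init(info);
      out.allocator()->init(info);

      NEBinaryLogicalOperation logical_and;
      logical_and.configure(&a, &b, &out, BinaryLogicalOperation::AND);

      a.allocator()->allocate();
      b.allocator()->allocate();
      out.allocator()->allocate();
      logical_and.run();
    }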
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
new file mode 100644
index 000000000..f2490e4e8
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NECast.h"
+
+#include "arm_compute/core/NEON/kernels/NECastKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NECastKernel>();
+ k->configure(input, output, input_subtype);
+ _kernel = std::move(k);
+}
+
+Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output,
+ SubDataType input_subtype)
+{
+ return NECastKernel::validate(input, output, input_subtype);
+}
+} // namespace arm_compute
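Usage sketch (not part of the diff): NECast as added above. The U8-to-F32 combination and the SubDataType::NONE value (taken to mean "no sub-type reinterpretation") are assumptions about NECastKernel.

    #include "arm_compute/runtime/NEON/functions/NECast.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_cast_example()
    {
      Tensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::U8));
      out.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

      NECast cast;
      cast.configure(&in, &out, SubDataType::NONE); // assumed: no sub-type reinterpretation of the input
      in.allocator()->allocate();
      out.allocator()->allocate();
      cast.run();
    }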
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
new file mode 100644
index 000000000..db419e3a8
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>();
+ k->configure(input, output, block_shape);
+ _kernel = std::move(k);
+}
+
+Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape);
+}
+} // namespace arm_compute
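Usage sketch (not part of the diff): NEDepthToSpaceLayerEx with assumed NCHW-style shapes; 4 channels with block_shape 2 become a single channel of twice the width and height.

    #include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_depth_to_space_example()
    {
      Tensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U, 1U), 1, DataType::F32));  // W, H, C, N
      out.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));

      NEDepthToSpaceLayerEx d2s;
      d2s.configure(&in, &out, /*block_shape=*/2);
      in.allocator()->allocate();
      out.allocator()->allocate();
      d2s.run();
    }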
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp
new file mode 100644
index 000000000..a95018a28
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NENegLayer::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>();
+ k->configure(ElementWiseUnaryEx::NEG, input, output);
+ _kernel = std::move(k);
+}
+Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output);
+}
+} // namespace arm_compute
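Usage sketch (not part of the diff): the NENegLayer wrapper added above; the 1-D F32 shape is illustrative.

    #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_neg_example()
    {
      Tensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

      NENegLayer neg;
      neg.configure(&in, &out);
      in.allocator()->allocate();
      out.allocator()->allocate();
      neg.run();
    }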
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
new file mode 100644
index 000000000..00c3ed94f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ k->configure(input, output, lookups);
+ _kernel = std::move(k);
+}
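Usage sketch (not part of the diff): NEEmbeddingLookup gathering rows from a 2-D table. The S32 index type and the table layout (row width x number of rows, output keeping the row width) are assumptions about NEEmbeddingLookupKernel.

    #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_embedding_lookup_example()
    {
      Tensor table, lookups, out;
      table.allocator()->init(TensorInfo(TensorShape(16U, 100U), 1, DataType::F32)); // 100 embeddings of width 16
      lookups.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));      // 4 indices to gather
      out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

      NEEmbeddingLookup lookup;
      lookup.configure(&table, &out, &lookups);
      table.allocator()->allocate();
      lookups.allocator()->allocate();
      out.allocator()->allocate();
      lookup.run();
    }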
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
new file mode 100644
index 000000000..d604fedbf
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+
+ return Status{};
+}
+} // namespace
+
+void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output)
+{
+ return NETransposeKernel::validate(input, output);
+}
+
+NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(),
+ _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(),
+ _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false),
+ _accumulate_biases(false), _is_prepared(false)
+{
+}
+
+void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
+}
+
+void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *biases, ITensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _accumulate_biases = false;
+ _original_weights = weights;
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr)
+ {
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensor *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ bool _is_fc_after_conv;
+ if (is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv,
+ "NEFullyConnectedHybridLayer does not support after conv");
+ (void)_is_fc_after_conv;
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights_output.allocator()->init(
+ weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights->info())));
+ _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+
+ // Quantize input
+ _quantized_input.allocator()->init(
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ _scale_factor.allocator()->init(
+ TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
+ _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
+
+ // GEMM
+ _gemmlowp_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);
+
+ // Multiply scale
+ _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
+ weights->info()->quantization_info().scale);
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+
+ _quantized_input.allocator()->allocate();
+ _scale_factor.allocator()->allocate();
+ _gemmlowp_output.allocator()->allocate();
+}
+
+Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *weights_to_use = weights;
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+
+ // Validate quantization kernel
+ const ITensorInfo &quantized_input = TensorInfo(
+ input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
+
+ const ITensorInfo &gemmlowp_output = TensorInfo(
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
+ &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale));
+
+ return Status{};
+}
+
+void NEFullyConnectedHybridLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Quantize input
+ NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY);
+
+ // Run matrix multiply
+ _mm_gemmlowp.run();
+
+ // Multiply scale factor
+ NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY);
+
+ // Accumulate biases if provided
+ if (_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+}
+
+void NEFullyConnectedHybridLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ auto release_unused = [](Tensor *w) {
+ if (!w->is_used())
+ {
+ w->allocator()->free();
+ }
+ };
+
+ // Reshape of the weights (happens only once)
+ if (!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+
+ _are_weights_reshaped = true;
+ // We can not release _original_weights because it can be used in other nodes
+ }
+
+ // Prepare GEMM and release unused weights
+ _mm_gemmlowp.prepare();
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+ _is_prepared = true;
+ }
+}
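Usage sketch (not part of the diff): the hybrid path added above, i.e. F32 activations with symmetric S8 weights. All shapes, the weight scale, the default FullyConnectedLayerInfo, and the nullptr memory manager are illustrative assumptions.

    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_fc_hybrid_example()
    {
      Tensor input, weights, biases, output;
      input.allocator()->init(TensorInfo(TensorShape(128U, 2U), 1, DataType::F32));   // 128 inputs, batch 2
      weights.allocator()->init(
          TensorInfo(TensorShape(128U, 64U), 1, DataType::S8, QuantizationInfo(0.05f, 0))); // 64 outputs, assumed scale
      biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(64U, 2U), 1, DataType::F32));

      NEFullyConnectedHybridLayer fc(nullptr);
      fc.configure(&input, &weights, &biases, &output, FullyConnectedLayerInfo());

      input.allocator()->allocate();
      weights.allocator()->allocate();
      biases.allocator()->allocate();
      output.allocator()->allocate();
      fc.run(); // run() calls prepare() internally on the first invocation
    }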
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..a944f699a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+ if (is_data_type_quantized_asymmetric(input.data_type()))
+ {
+ // Since we need negative offsets for computing convolution, we need to change
+ // QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info(input.quantization_info().scale,
+ -input.quantization_info().offset);
+ const QuantizationInfo weights_quantization_info(weights.quantization_info().scale,
+ -weights.quantization_info().offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
+ &input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
+ &input, &weights, nullptr, &output, 1.f, 0.0f,
+ GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
+ }
+
+ return Status{};
+}
+} // namespace
+
+NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
+ _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
+ _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
+ _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
+ _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
+ _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+{
+}
+
+void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ if (_is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change
+ // QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+ input->info()->set_quantization_info(
+ QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+ weights->info()->set_quantization_info(
+ QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp.configure(input, weights, nullptr, output);
+
+ // Revert back QuantizationInfo as input and weights could be used in other fully connected
+ // layers
+ input->info()->set_quantization_info(input_quantization_info);
+ weights->info()->set_quantization_info(weights_quantization_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f,
+ GEMMInfo(false, false, false /* Reshape weights only for the first run */));
+ }
+}
+
+void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(
+ (weights->info()->dimension(1) !=
+ (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be
+ // linearized
+
+ // Initialize output tensor for flatten
+ TensorShape shape_flatten = compute_flatten_shape(input->info());
+ _flatten_output.allocator()->init(
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ shape_flatten));
+
+ // Configure flatten kernel
+ _memory_group.manage(&_flatten_output);
+ _flatten_kernel.configure(input, &_flatten_output);
+
+ // Configure matrix multiply kernel
+ configure_mm(&_flatten_output, weights, output);
+
+ // Allocate the output tensor for flatten once all the configure methods have been called
+ _flatten_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(input, weights, output);
+}
+
+void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *biases, ITensor *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ fc_info));
+
+ _are_weights_converted = true;
+ _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ _is_fc_after_conv = true;
+ _accumulate_biases = false;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _original_weights = weights;
+
+ // Configure gemmlowp output
+ if (_is_quantized)
+ {
+ _gemmlowp_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::S32));
+ }
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr && !_is_quantized)
+ {
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensor *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+ if (is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = input->info()->num_dimensions() > 1;
+ }
+
+ // Reshape weights if needed
+ if (!_are_weights_reshaped)
+ {
+ // Reshape the weights
+ _reshape_weights_function.configure(weights, &_reshape_weights_output);
+ weights_to_use = &_reshape_weights_output;
+ }
+
+ // Convert weights if needed
+ if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights.configure(weights_to_use, &_converted_weights_output,
+ input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+ weights_to_use = &_converted_weights_output;
+ _are_weights_converted = false;
+ }
+
+ ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+ if (_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(input, weights_to_use, tmp_output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(input, weights_to_use, tmp_output);
+ }
+
+ // Configure output stage for asymmetric quantized types
+ if (_is_quantized)
+ {
+ float multiplier = input->info()->quantization_info().scale *
+ weights->info()->quantization_info().scale /
+ output->info()->quantization_info().scale;
+ int output_multiplier;
+ int output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
+ &output_shift);
+ _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
+ output_shift, output->info()->quantization_info().offset);
+ _gemmlowp_output.allocator()->allocate();
+ }
+
+ _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *biases, const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+ bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+ const ITensorInfo &flatten_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_flatten_shape(input)));
+ const ITensorInfo &reshaped_weights =
+ TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+ compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights =
+ weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
+ const ITensorInfo &gemmlowp_output = TensorInfo(
+ output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if (biases != nullptr && !is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1;
+ }
+
+ if (!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if (is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) !=
+ (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+ input_to_use = &flatten_input;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+ // Validate output stage for asymmetric quantized types
+ if (is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+ &gemmlowp_output, biases, output));
+ }
+
+ return Status{};
+}
+
+void NEFullyConnectedLayerEx::run()
+{
+ if (!_is_prepared)
+ {
+ if (!_are_weights_reshaped)
+ _reshape_weights_output.allocator()->allocate();
+ if (!_are_weights_converted)
+ _converted_weights_output.allocator()->allocate();
+ _is_prepared = true;
+ }
+
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Reshape of the weights
+ if (!_are_weights_reshaped)
+ {
+ _reshape_weights_function.run();
+ }
+
+ // Convert weights if needed
+ if (!_are_weights_converted)
+ {
+ _convert_weights.run();
+ }
+
+ // Prepare GEMM
+ if (!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+ }
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Linearize input if it comes from a convolutional layer
+ if (_is_fc_after_conv)
+ {
+ NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+ }
+
+ // Run matrix multiply
+ if (_is_quantized)
+ {
+ _mm_gemmlowp.run();
+ }
+ else
+ {
+ _mm_gemm.run();
+ }
+
+ // Accumulate biases if provided
+ if (_is_quantized)
+ {
+ _gemmlowp_output_stage.run();
+ }
+ else
+ {
+ if (_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+ }
+}
+
+void NEFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ auto release_unused = [](Tensor *w) {
+ if (!w->is_used())
+ {
+ w->allocator()->free();
+ }
+ };
+
+ // Pointer to current weights
+ const ITensor *cur_weights = _original_weights;
+
+ // Reshape of the weights (happens only once)
+ if (!_are_weights_reshaped)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+
+ cur_weights->mark_as_unused();
+ cur_weights = &_reshape_weights_output;
+ _are_weights_reshaped = true;
+ }
+
+ // Convert weights if needed (happens only once)
+ if (!_are_weights_converted)
+ {
+ _converted_weights_output.allocator()->allocate();
+ _convert_weights.run();
+
+ cur_weights->mark_as_unused();
+ _are_weights_converted = true;
+ }
+
+ // Release reshaped weights if unused
+ release_unused(&_reshape_weights_output);
+
+ // Prepare GEMM and release unused weights
+ if (!_is_quantized)
+ {
+ _mm_gemm.prepare();
+ }
+
+ // Release converted weights if unused
+ release_unused(&_reshape_weights_output);
+ release_unused(&_converted_weights_output);
+
+ _is_prepared = true;
+ }
+#endif
+}
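Usage sketch (not part of the diff): the all-F32 path of NEFullyConnectedLayerEx; shapes and the nullptr memory manager are illustrative assumptions. Note that, as added above, run() re-executes the weight reshape and conversion on every call since prepare() is stubbed out.

    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_fc_ex_example()
    {
      Tensor input, weights, biases, output;
      input.allocator()->init(TensorInfo(TensorShape(128U, 2U), 1, DataType::F32));    // 128 inputs, batch 2
      weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)); // 64 outputs
      biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(64U, 2U), 1, DataType::F32));

      NEFullyConnectedLayerEx fc(nullptr);
      fc.configure(&input, &weights, &biases, &output, FullyConnectedLayerInfo());

      input.allocator()->allocate();
      weights.allocator()->allocate();
      biases.allocator()->allocate();
      output.allocator()->allocate();
      fc.run(); // reshapes the weights each call; prepare() is a no-op in this variant
    }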
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..fcac3c7ae
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
+ const arm_compute::ITensor *weights,
+ const arm_compute::ITensor *biases,
+ arm_compute::ITensor *output, bool needs_reshape,
+ const arm_compute::TensorShape &reshape,
+ KernelType kernel_type)
+{
+ _input = input;
+ _weights = weights;
+ _biases = biases;
+ _output = output;
+ _needs_reshape = needs_reshape;
+
+ const ITensor *input_to_use = input;
+ if (_needs_reshape)
+ {
+ // reshape
+ auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+ _neon_reshape.configure(_input, &_neon_buffer);
+ input_to_use = &_neon_buffer;
+ }
+
+ _neon_fc = [&]() {
+ if (kernel_type == KernelType::GENERAL)
+ {
+ auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+ {
+ assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+ bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+ weights->info()->data_type() == DataType::S8;
+
+ if (is_hybrid)
+ {
+ auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ else
+ {
+ auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager};
+ fc->configure(input_to_use, _weights, _biases, _output);
+ return std::unique_ptr<arm_compute::IFunction>(fc);
+ }
+ }
+ }();
+
+ // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+ if (_needs_reshape)
+ {
+ _neon_buffer.allocator()->allocate();
+ }
+}
+
+void NEFullyConnectedReshapingLayer::run(void)
+{
+ if (_needs_reshape)
+ _neon_reshape.run();
+
+ _neon_fc->run();
+}
+
+void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); }
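Usage sketch (not part of the diff): the reshaping wrapper collapsing a 4-D tensor before the fully connected layer. The constructor taking a memory manager, the nested KernelType enum spelling, and all shapes are assumptions inferred from the code above rather than from this patch.

    #include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_fc_reshaping_example()
    {
      Tensor input, weights, biases, output;
      input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U, 2U), 1, DataType::F32)); // W, H, C, N
      weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32));    // 4*4*8 = 128 inputs, 10 outputs
      biases.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(10U, 2U), 1, DataType::F32));

      NEFullyConnectedReshapingLayer fc(nullptr); // constructor signature assumed
      fc.configure(&input, &weights, &biases, &output,
                   /*needs_reshape=*/true, TensorShape(128U, 2U),
                   NEFullyConnectedReshapingLayer::KernelType::GENERAL);

      input.allocator()->allocate();
      weights.allocator()->allocate();
      biases.allocator()->allocate();
      output.allocator()->allocate();

      fc.prepare();
      fc.run();
    }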
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
new file mode 100644
index 000000000..11794a1ea
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
+ _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+ _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+ _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
+ _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+ _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
+ _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
+ ITensor *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_UNUSED(c);
+ ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
+ a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
+
+ const ITensor *matrix_a = a;
+ const ITensor *matrix_b = b;
+ GEMMInfo info = gemm_info;
+
+ // Clear state
+ _mtx_a_reshape_kernel = nullptr;
+ _mtx_b_reshape_kernel = nullptr;
+
+ // Set internal variables
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+ _is_prepared = false;
+ _fused_assembly_path = false;
+ _original_b = b;
+
+ const ITensor *a_to_use = a;
+
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+ _memory_group.manage(&_mm_result_s32);
+ TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
+ _mm_result_s32.allocator()->init(info_mm_result_s32);
+ }
+
+#ifdef __aarch64__
+#if 0 // Can use after arm compute library v19.11
+ switch (a->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::U8:
+ case DataType::S8:
+ {
+ if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+ _fused_assembly_path = _asm_glue.is_configured();
+ }
+ else
+ {
+ _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
+ gemm_info);
+ }
+ _assembly_path = _asm_glue.is_configured();
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
+ }
+#endif // 0
+ ARM_COMPUTE_ERROR("aarch64 not supported");
+#endif /* __aarch64__ */
+ if (!(_assembly_path || _run_vector_matrix_multiplication))
+ {
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
+ // 4.0f) ]
+ TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
+ a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
+ // 16.0f) ]
+ TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
+ b->info()->quantization_info());
+ _tmp_a.allocator()->init(a_info);
+ _tmp_b.allocator()->init(b_info);
+ _memory_group.manage(&_tmp_a);
+ if (!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_tmp_b);
+ }
+
+ // Configure interleave kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(a_to_use, &_tmp_a);
+ _mtx_a_reshape_kernel = std::move(k);
+ }
+
+ // Configure transpose kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = std::move(k);
+ }
+ }
+
+ if (!_fused_assembly_path)
+ {
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0)
+ {
+ TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
+
+ _vector_sum_col.allocator()->init(info_vector_sum_col);
+ if (!_reshape_b_only_on_first_run)
+ {
+ _memory_group.manage(&_vector_sum_col);
+ }
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if (_b_offset != 0)
+ {
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
+
+ _vector_sum_row.allocator()->init(info_vector_sum_row);
+ _memory_group.manage(&_vector_sum_row);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
+ false);
+ }
+
+ if (_fuse_output_stage)
+ {
+ // Configure matrix multiply kernel
+ if (!_assembly_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, &_mm_result_s32);
+ _mm_kernel = std::move(k);
+ }
+
+ _offset_contribution_output_stage_kernel.configure(
+ &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
+ _b_offset, info.gemmlowp_output_stage());
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ if (!_assembly_path)
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(matrix_a, matrix_b, output);
+ _mm_kernel = std::move(k);
+ }
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row,
+ a_to_use->info()->dimension(0), _a_offset, _b_offset);
+ }
+ }
+
+ // Allocate tensors
+ if (!_assembly_path && !_run_vector_matrix_multiplication)
+ {
+ _tmp_a.allocator()->allocate();
+ if (!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
+ }
+
+ if (!_fused_assembly_path)
+ {
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ }
+
+ if (_b_offset != 0)
+ {
+ _vector_sum_row.allocator()->allocate();
+ }
+ }
+
+ if (_fuse_output_stage)
+ {
+ _mm_result_s32.allocator()->allocate();
+ }
+}
+
+Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
+ const ITensorInfo *c, const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+ "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is "
+ "equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
+ "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
+ "Matrix B already reshaped is not supported");
+
+ GEMMInfo info = gemm_info;
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ const ITensorInfo *a_to_use = a;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo mm_result_s32_info{};
+
+ int32_t a_offset = a->quantization_info().offset;
+ int32_t b_offset = b->quantization_info().offset;
+
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ if (fuse_output_stage)
+ {
+ auto_init_if_empty(
+ mm_result_s32_info,
+ a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ }
+
+ // Check if we need to run the optimized assembly kernel
+ bool run_optimised = false;
+ bool run_optimised_requantized = false;
+ const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+ if (a_to_use->data_type() == DataType::QASYMM8 &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f,
+ reshape_b_only_on_first_run));
+ run_optimised_requantized = run_optimised;
+ }
+ else
+ {
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(
+ a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f,
+ reshape_b_only_on_first_run));
+ }
+
+ if (run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if (info.depth_output_gemm3d() != 0)
+ {
+ if (info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+ "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+ "NEGEMM cannot reinterpret the output tensor as 3D");
+
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ if (!run_vector_matrix_multiplication)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // The interleaved output matrix will have the following shape:
+ // [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape:
+ // [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
+ }
+ }
+
+ if (!run_optimised_requantized)
+ {
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
+
+ // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+ if (a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
+ b, &info_vector_sum_col, a->dimension(0), false));
+ }
+
+ // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+ if (b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
+ a_to_use, &info_vector_sum_row, a->dimension(0), false));
+ }
+
+ if (fuse_output_stage)
+ {
+ if (!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
+ info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ if (!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
+ output, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
+ }
+ }
+ return Status{};
+}
+
+void NEGEMMLowpMatrixMultiplyCoreEx::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Reshape inputs
+ if (_mtx_a_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+ }
+ if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+ {
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ }
+
+ // Run GEMM
+ if (_asm_glue.is_configured())
+ {
+ _asm_glue.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
+
+ if (!_fused_assembly_path)
+ {
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if (_b_offset != 0)
+ {
+ NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ }
+
+ if (_fuse_output_stage)
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
+ }
+ else
+ {
+ // Run offset contribution kernel
+ NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+ }
+ }
+}
+
+void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
+{
+ if (!_is_prepared)
+ {
+ // Run assembly reshape
+ if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ _asm_glue.prepare();
+ _original_b->mark_as_unused();
+ }
+ // Run non-assembly reshape
+ else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ // Run reshape kernel and mark original weights tensor as unused
+ _tmp_b.allocator()->allocate();
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ _original_b->mark_as_unused();
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ }
+
+ _is_prepared = true;
+ }
+}
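Editorial note (not part of the diff): a minimal sketch of how the validate() entry point above can be exercised, assuming an S8 x S8 -> S32 GEMM with a default GEMMInfo (no fused output stage); the shapes, scale and zero-point values are illustrative only.

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include <iostream>

using namespace arm_compute;

int main()
{
  // ACL shapes are [width, height, ...]: A is K x M, B is N x K, the S32 output is N x M.
  const TensorInfo a(TensorShape(32U, 4U), 1, DataType::S8, QuantizationInfo(1.f, 0));
  const TensorInfo b(TensorShape(16U, 32U), 1, DataType::S8, QuantizationInfo(1.f, 0));
  const TensorInfo dst(TensorShape(16U, 4U), 1, DataType::S32);

  // c == nullptr: bias addition is rejected when no output stage is fused (see validate() above).
  const Status status = NEGEMMLowpMatrixMultiplyCoreEx::validate(&a, &b, nullptr, &dst, GEMMInfo());
  if (bool(status))
    std::cout << "validate: OK" << std::endl;
  else
    std::cout << "validate: " << status.error_description() << std::endl;
}

The non-static configure() is assumed to take the same (a, b, c, output, gemm_info) ordering, mirroring the _asm_glue.configure() calls in the function body.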
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 000000000..90dabb35a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+ k->configure(input, indices, output, axis);
+ _kernel = std::move(k);
+}
+
+Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+ const ITensorInfo *output, int axis)
+{
+ return NEGatherKernelEx::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
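Editorial note (not part of the diff): a minimal NEGatherEx usage sketch. The U32 index type and the gathered output shape are assumptions about NEGatherKernelEx's contract; only the configure()/validate() signatures come from the source above.

#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
  // Pick 3 rows (axis 1) out of a 4x5 F32 tensor.
  Tensor input, indices, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 5U), 1, DataType::F32));
  indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));
  output.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));

  NEGatherEx gather;
  gather.configure(&input, &indices, &output, /* axis */ 1);

  input.allocator()->allocate();
  indices.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input and indices ...
  gather.run();
}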
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
new file mode 100644
index 000000000..624185d2c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
+
+#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
+ ITensor *output, ITensor *hits)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
+ k->configure(lookups, keys, input, output, hits);
+ _kernel = std::move(k);
+}
+
+Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+ const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *hits)
+{
+ return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits);
+}
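Editorial note (not part of the diff): a minimal NEHashtableLookup usage sketch, assuming S32 lookups/keys, a row-per-key value table and a U8 hits tensor; the exact type and shape rules are enforced by NEHashtableLookupKernel::validate.

#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
  // Look up 2 keys in a table of 8 entries; each entry is a row of 16 floats.
  Tensor lookups, keys, values, output, hits;
  lookups.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32));
  keys.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));
  values.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(16U, 2U), 1, DataType::F32));
  hits.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U8));

  NEHashtableLookup lookup;
  lookup.configure(&lookups, &keys, &values, &output, &hits);

  lookups.allocator()->allocate();
  keys.allocator()->allocate();
  values.allocator()->allocate();
  output.allocator()->allocate();
  hits.allocator()->allocate();
  // ... fill lookups, keys and values ...
  lookup.run();
}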
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..1c2c8f027
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
+ std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+ _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma,
+ ITensor *beta, float epsilon)
+{
+ const DataLayout data_layout = input->info()->data_layout();
+
+ // Configure Kernels
+ _is_nchw = data_layout == DataLayout::NCHW;
+
+ if (!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+ _permute_output.configure(&_permuted_output, output != nullptr ? output : input,
+ PermutationVector(2U, 0U, 1U));
+ _permuted_input.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+ _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+ }
+}
+
+Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *gamma, const ITensorInfo *beta,
+ float epsilon)
+{
+ return NEInstanceNormalizationLayerKernelEx::validate(
+ &input->clone()->set_data_layout(DataLayout::NCHW),
+ &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+void NEInstanceNormalizationLayerEx::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if (!_is_nchw)
+ {
+ _permute_input.run();
+ }
+
+ NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+ // Permute output
+ if (!_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+} // namespace arm_compute
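Editorial note (not part of the diff): a minimal NEInstanceNormalizationLayerEx usage sketch for an NCHW F32 input; the "one gamma/beta value per channel" layout is an assumption about NEInstanceNormalizationLayerKernelEx.

#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
  // NCHW input: W = 8, H = 8, C = 4, N = 1 (TensorInfo defaults to the NCHW layout).
  Tensor input, output, gamma, beta;
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U, 1U), 1, DataType::F32));
  gamma.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
  beta.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));

  NEInstanceNormalizationLayerEx instnorm;
  instnorm.configure(&input, &output, &gamma, &beta, 1e-5f);

  input.allocator()->allocate();
  output.allocator()->allocate();
  gamma.allocator()->allocate();
  beta.allocator()->allocate();
  // ... fill input, gamma and beta ...
  instnorm.run();
}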
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
new file mode 100644
index 000000000..1150cef76
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
+
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+}
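Editorial note (not part of the diff): a minimal NEPReLU usage sketch. F32 tensors and a per-channel alpha that the kernel broadcasts over width/height are assumptions about NEPReLUKernel's supported configurations.

#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
  Tensor input, alpha, output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
  alpha.allocator()->init(TensorInfo(TensorShape(1U, 1U, 8U), 1, DataType::F32)); // one slope per channel
  output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

  NEPReLU prelu;
  prelu.configure(&input, &alpha, &output);

  input.allocator()->allocate();
  alpha.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input and alpha ...
  prelu.run(); // run() comes from the simple-function base class that owns _kernel
}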
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
new file mode 100644
index 000000000..84411c266
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
+ _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
+ _gemm_output(), _add_output(), _is_prepared(false)
+{
+}
+
+Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+ const ITensorInfo *hidden_state, const ITensorInfo *output,
+ const ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
+ output);
+
+ const int idx_width = 0;
+ const int idx_height = 1;
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
+ recurrent_weights->dimension(idx_width));
+ ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
+ recurrent_weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ hidden_state->tensor_shape());
+
+ auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(
+ recurrent_weights, hidden_state->dimension(idx_height)),
+ 1, input->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
+ &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+ return Status{};
+}
+
+void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
+ const ITensor *recurrent_weights, const ITensor *bias,
+ ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
+ recurrent_weights->info(), bias->info(),
+ hidden_state->info(), output->info(), info));
+
+ const int idx_height = 1;
+ TensorShape shape = misc::shape_calculator::compute_rnn_shape(
+ recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+ _is_prepared = false;
+
+ // Initialize intermediate buffers
+ _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+ // Manage intermediate buffers and configure
+ _memory_group.manage(&_fully_connected_out);
+ _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
+
+ _memory_group.manage(&_gemm_output);
+ _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+ _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+ _memory_group.manage(&_add_output);
+
+ _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
+ ConvertPolicy::SATURATE);
+
+ _fully_connected_out.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+
+ _activation_kernel.configure(&_add_output, hidden_state, info);
+ _add_output.allocator()->allocate();
+
+ _copy_kernel.configure(hidden_state, output);
+}
+
+void NERNNLayerEx::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _fully_connected_kernel.run();
+
+ _gemm_state_f.run();
+
+ NEScheduler::get().schedule(&_add_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
+
+ // Copy the updated hidden state to the output tensor
+ NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
+}
+
+void NERNNLayerEx::prepare()
+{
+ if (!_is_prepared)
+ {
+ _fully_connected_kernel.prepare();
+ _gemm_state_f.prepare();
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
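Editorial note (not part of the diff): a minimal NERNNLayerEx usage sketch. The shape relations (input_size 16, num_units 32, batch 2) follow the checks in validate() above; the TANH activation is just an example value.

#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
  Tensor input, weights, recurrent_weights, bias, hidden_state, output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 2U), 1, DataType::F32));              // [input_size, batch]
  weights.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));           // [input_size, num_units]
  recurrent_weights.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32)); // [num_units, num_units]
  bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));                   // [num_units]
  hidden_state.allocator()->init(TensorInfo(TensorShape(32U, 2U), 1, DataType::F32));       // [num_units, batch]
  output.allocator()->init(TensorInfo(TensorShape(32U, 2U), 1, DataType::F32));

  ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
  NERNNLayerEx rnn;
  rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);

  input.allocator()->allocate();
  weights.allocator()->allocate();
  recurrent_weights.allocator()->allocate();
  bias.allocator()->allocate();
  hidden_state.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input, weights, recurrent_weights, bias and the initial hidden_state ...
  rnn.run(); // updates hidden_state in place and copies it to output
}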
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
new file mode 100644
index 000000000..c65e93570
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels =
+ arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
+ _reduced_outs =
+ arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape = i == 0 ? input->info()->tensor_shape()
+ : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info())
+ .set_data_layout(output->info()->data_layout()));
+ _memory_group.manage(_reduced_outs.get() + i);
+ _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i],
+ ReductionOperation::MEAN_SUM);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+ }
+}
+
+void NEReduceMeanEx::run()
+{
+ _memory_group.acquire();
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+ _memory_group.release();
+}
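Editorial note (not part of the diff): a minimal NEReduceMeanEx usage sketch reducing width and height of an 8x4x2 F32 tensor with keep_dims = false; the resulting output shape follows the remove_dimension logic in validate() above.

#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::F32)); // axes 0 and 1 dropped

  NEReduceMeanEx reduce_mean;
  reduce_mean.configure(&input, Coordinates(0, 1), /* keep_dims */ false, &output);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  reduce_mean.run();
}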
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
new file mode 100644
index 000000000..b36f8287a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output, ReduceOperation op)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_UNUSED(op);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output, ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape =
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], op);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info()));
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+ }
+}
+
+void NEReduceOperation::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
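Editorial note (not part of the diff): the same calling pattern as NEReduceMeanEx, but with an explicit ReduceOperation argument; ReduceOperation::MAX is used here because MIN/MAX are the enumerators visible elsewhere in this diff, and the enum declaration is pulled in through the function header.

#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
  // Reduce-max over the channel axis (axis 2) of an 8x4x2 tensor, keeping the reduced dimension.
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(8U, 4U, 1U), 1, DataType::F32));

  NEReduceOperation reduce_max;
  reduce_max.configure(&input, Coordinates(2), /* keep_dims */ true, &output, ReduceOperation::MAX);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  reduce_max.run();
}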
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
new file mode 100644
index 000000000..3c18217ef
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+ _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
+ bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ TensorShape out_shape = input->tensor_shape();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
+ input->num_dimensions() - 1);
+ if (output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if (keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+
+ return Status{};
+}
+
+void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
+ ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ _reduction_ops = reduction_axis.num_dimensions();
+ _reduction_kernels.resize(_reduction_ops);
+ _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
+ _keep_dims = keep_dims;
+
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+
+ // Convert negative axis
+ for (unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ axis_local[i] = wrap_around(axis_local[i], input_dims);
+ }
+
+ // Perform reduction for every axis
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ TensorShape out_shape =
+ i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ out_shape.set(axis_local[i], 1);
+ auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
+
+ if (i == _reduction_ops - 1 && keep_dims)
+ {
+ _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM);
+ }
+ else
+ {
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
+ input->info()->data_type(),
+ input->info()->quantization_info())
+ .set_data_layout(input->info()->data_layout()));
+ _memory_group.manage(&_reduced_outs[i]);
+ _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i],
+ ReductionOperation::SUM);
+ }
+ }
+
+ // Allocate intermediate tensors
+ for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ {
+ _reduced_outs[i].allocator()->allocate();
+ }
+
+ // Configure reshape layer if we want to drop the dimensions
+ if (!keep_dims)
+ {
+ TensorShape out_shape = input->info()->tensor_shape();
+
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+ _reshape.configure(&_reduced_outs[_reduction_ops - 1], output);
+ }
+}
+
+void NEReduceSum::run()
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ for (unsigned int i = 0; i < _reduction_ops; ++i)
+ {
+ _reduction_kernels[i].run();
+ }
+
+ if (!_keep_dims)
+ {
+ _reshape.run();
+ }
+}
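Editorial note (not part of the diff): NEReduceSum follows the same configure()/run() pattern as the reduce functions above, so this sketch only exercises the static validate() check; the shapes are illustrative.

#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include <iostream>

using namespace arm_compute;

int main()
{
  // Sum over the width axis of a 10x6 F32 tensor, dropping the reduced dimension.
  const TensorInfo input(TensorShape(10U, 6U), 1, DataType::F32);
  const TensorInfo output(TensorShape(6U), 1, DataType::F32);

  const Status status = NEReduceSum::validate(&input, Coordinates(0), /* keep_dims */ false, &output);
  if (bool(status))
    std::cout << "validate: OK" << std::endl;
  else
    std::cout << "validate: " << status.error_description() << std::endl;
}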
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
new file mode 100644
index 000000000..c3431c418
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Define dimension to split the window
+ *
+ * @param[in] axis Reduction axis
+ *
+ * @return The dimension to split the window
+ */
+size_t reduction_window_split_dimension(unsigned int axis)
+{
+ switch (axis)
+ {
+ case 0:
+ return Window::DimY;
+ case 1:
+ case 2:
+ case 3:
+ return Window::DimX;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported reduction axis");
+ }
+}
+} // namespace
+
+NEReductionOperationEx::NEReductionOperationEx()
+ : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
+{
+}
+
+Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ unsigned int axis, ReduceOperation op)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op));
+
+ return Status{};
+}
+
+void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ NEReductionOperationEx::validate(input->info(), output->info(), axis, op));
+
+ // Configure reduction kernel
+ _reduction_kernel.configure(input, output, axis, op);
+ _window_split = reduction_window_split_dimension(axis);
+ _reduction_axis = axis;
+
+ if (axis == 0)
+ {
+ // Configure fill border kernel
+ const BorderSize fill_border_size = _reduction_kernel.border_size();
+ PixelValue pixelValue;
+ switch (op)
+ {
+ case ReduceOperation::MIN:
+ {
+ switch (input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ pixelValue = PixelValue(std::numeric_limits<float>::max());
+ break;
+ }
+ case DataType::F16:
+ {
+ pixelValue = PixelValue(static_cast<half>(65504.0f));
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ pixelValue =
+ PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported DataType");
+ }
+ }
+ break;
+ }
+ case ReduceOperation::MAX:
+ {
+ switch (input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ pixelValue = PixelValue(-std::numeric_limits<float>::max());
+ break;
+ }
+ case DataType::F16:
+ {
+ pixelValue = PixelValue(static_cast<half>(-65504.0f));
+ break;
+ }
+ case DataType::QASYMM8:
+ {
+ pixelValue =
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported DataType");
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Reduction Operation unsupported");
+ }
+ _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
+ }
+}
+
+void NEReductionOperationEx::run()
+{
+ if (_reduction_axis == 0)
+ {
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_reduction_kernel, _window_split);
+}
+} // namespace arm_compute
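Editorial note (not part of the diff): a minimal single-axis NEReductionOperationEx sketch; keeping the reduced dimension as 1 in the output is an assumption about NEReductionOperationKernelEx's output shape.

#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
  // Minimum along the width axis (axis 0) of an 8x4 F32 tensor.
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));

  NEReductionOperationEx reduce_min;
  reduce_min.configure(&input, &output, /* axis */ 0, ReduceOperation::MIN);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  reduce_min.run(); // axis 0 also schedules the constant fill-border kernel configured above
}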
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
new file mode 100644
index 000000000..c9f914fb0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NESpaceToBatchLayerEx::NESpaceToBatchLayerEx()
+ : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+{
+}
+
+void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape,
+ const ITensor *paddings, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(
+ output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+ }
+ _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x,
+ const int block_shape_y, const Size2D &padding_left,
+ const Size2D &padding_right, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ {
+ _has_padding = true;
+ _memset_kernel.configure(
+ output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
+ }
+ _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right,
+ output);
+}
+
+Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape,
+ const ITensorInfo *paddings, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+ return Status{};
+}
+
+Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x,
+ const int block_shape_y, const Size2D &padding_left,
+ const Size2D &padding_right, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(
+ input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+ return Status{};
+}
+
+void NESpaceToBatchLayerEx::run()
+{
+ // Zero out output only if we have paddings
+ if (_has_padding)
+ {
+ NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+ }
+ NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
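Editorial note (not part of the diff): a minimal NESpaceToBatchLayerEx sketch using the static block-shape/padding overload; the output shape assumes the usual SPACE_TO_BATCH_ND transform (spatial blocks moved into the batch dimension).

#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
  // NCHW 4x4x2 input (batch 1), 2x2 block, no padding -> 2x2x2 output with batch 4.
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 2U, 4U), 1, DataType::F32));

  NESpaceToBatchLayerEx space_to_batch;
  space_to_batch.configure(&input, /* block_shape_x */ 2, /* block_shape_y */ 2, Size2D(0, 0),
                           Size2D(0, 0), &output);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  space_to_batch.run(); // no memset is scheduled because input and output have the same total size
}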
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
new file mode 100644
index 000000000..b6ae21cc0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+{
+ auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>();
+ k->configure(input, output, block_shape);
+ _kernel = std::move(k);
+}
+
+Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ int32_t block_shape)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape));
+ return Status{};
+}
+} // namespace arm_compute
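Editorial note (not part of the diff): a minimal NESpaceToDepthLayerEx sketch; the 2x2x8 output shape assumes the usual SPACE_TO_DEPTH transform (each 2x2 spatial block folded into the channel dimension).

#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 8U), 1, DataType::F32));

  NESpaceToDepthLayerEx space_to_depth;
  space_to_depth.configure(&input, &output, /* block_shape */ 2);

  input.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input ...
  space_to_depth.run();
}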
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
new file mode 100644
index 000000000..fd15ef05f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _conv_f(),
+ _upsample_f(),
+ _flip_weights(),
+ _permute_input(),
+ _permute_weights(),
+ _permute_output(),
+ _scaled_output(),
+ _weights_flipped(),
+ _permuted_input(),
+ _permuted_weights(),
+ _permuted_output(),
+ _is_nchw(false),
+ _original_weights(nullptr),
+ _input(nullptr),
+ _info(),
+ _is_prepared(false)
+{
+}
+
+Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, const ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
+ const unsigned int width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
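+  // Only square kernels with a side length of at least 1 are supported.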
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
+
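+  // Compute the output spatial dimensions, taking the invalid right/bottom borders into account.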
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
+ weights->dimension(height_idx), info, invalid_right, invalid_bottom);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ if (is_data_type_quantized_asymmetric(input->data_type()) && bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else if (bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+
+ if (output->tensor_shape().total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(),
+ "Output's dim 0 is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(),
+ "Output's dim 1 is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(),
+ "Output's dim 2 is invalid.");
+ }
+
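+  // Derive the upsampled intermediate shape and the padding required so that a stride-1
+  // convolution on it produces the requested output shape.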
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(
+ input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+ scale_out_info.set_data_layout(input->data_layout());
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
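+  // Upsampling must preserve the batch and channel dimensions of the input.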
+ const unsigned int batches_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) !=
+ scale_out_info.dimension(batches_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) !=
+ scale_out_info.dimension(channel_idx));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, WeightsInfo()));
+
+ return Status{};
+}
+
+void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias,
+ ITensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ _input = input;
+ _original_weights = weights;
+ _info = info;
+ _is_prepared = false;
+ _is_nchw = data_layout == DataLayout::NCHW;
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const unsigned int width_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
+ invalid_right, invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
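+  // The upsampled intermediate tensor is managed by the memory group so its memory can be reused.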
+ _memory_group.manage(&_scaled_output);
+
+ if (!_is_nchw)
+ {
+ _memory_group.manage(&_permuted_input);
+ _memory_group.manage(&_permuted_weights);
+ _memory_group.manage(&_permuted_output);
+
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permuted_input.info()->set_quantization_info(input->info()->quantization_info());
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+ // Configure the function to transform the weights tensor from NHWC -> NCHW
+ _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info());
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+
+    // Find the upsampled dimensions and the padding needed for the stride-1 convolution in
+    // order to match the output shape
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right,
+ invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(),
+ _permuted_input.info()->quantization_info());
+ scale_out_info.set_data_layout(DataLayout::NCHW);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::CEIL);
+ _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
+
+ _weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
+ _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info());
+ _flip_weights.configure(&_permuted_weights, &_weights_flipped);
+
+    // Set up the function to convolve the upsampled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
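+    // Reorder the NHWC output shape (C, W, H, N) into NCHW order (W, H, C, N) for the
+    // intermediate convolution output.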
+ const auto out_shape = output->info()->tensor_shape();
+ TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]};
+ TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(),
+ output->info()->quantization_info());
+ _permuted_output.allocator()->init(permuted_out_info);
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info);
+
+    // Configure the function to transform the convolved output back to NHWC
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+ else
+ {
+    // Find the upsampled dimensions and the padding needed for the stride-1 convolution in
+    // order to match the output shape
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ _scaled_output.allocator()->init(scale_out_info);
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+ _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped);
+
+    // Set up the function to convolve the upsampled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+ }
+ _scaled_output.allocator()->allocate();
+}
+
+void NETransposeConvLayer::run()
+{
+ prepare();
+
+ // MemoryGroupResourceScope scope_mg(_memory_group);
+
+ // Permute input
+ if (!_is_nchw)
+ {
+ _permute_input.run();
+ }
+
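+  // Upsample the (possibly permuted) input, then convolve it with the flipped weights.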
+ _upsample_f.run();
+ _conv_f.run();
+
+ // Permute output
+ if (!_is_nchw)
+ {
+ _permute_output.run();
+ }
+}
+
+void NETransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ // Permute weights
+ if (!_is_nchw)
+ {
+ _permute_weights.run();
+ }
+ NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
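+    // Release the flipped weights if the convolution function no longer needs them.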
+ if (!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute