1 files changed, 477 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..a944f699a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{ // Checks the matrix-multiply stage: gemmlowp for asymmetric-quantized input, NEGEMM otherwise
+  if (is_data_type_quantized_asymmetric(input.data_type()))
+  {
+    // The low-precision matrix multiply is validated with negated offsets, so build
+    // QuantizationInfo values for input and weights that keep each scale unchanged
+    // and flip the sign of each offset
+    const QuantizationInfo input_quantization_info(input.quantization_info().scale,
+                                                   -input.quantization_info().offset);
+    const QuantizationInfo weights_quantization_info(weights.quantization_info().scale,
+                                                     -weights.quantization_info().offset);
+
+    // Validate gemmlowp on clones carrying the negated offsets; the caller's infos stay untouched
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
+        &input.clone()->set_quantization_info(input_quantization_info),
+        &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
+        &input, &weights, nullptr, &output, 1.f, 0.0f,
+        GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
+  }
+
+  return Status{};
+}
+} // namespace
+
+NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
+      _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
+      _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
+      _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
+      _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
+      _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+{ // Only the initializer list does work here; configure() later overwrites the state flags
+}
+
+void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights,
+                                           ITensor *output)
+{ // Configures the GEMM stage: _mm_gemmlowp when _is_quantized is set, _mm_gemm otherwise
+  if (_is_quantized)
+  {
+    // gemmlowp is configured with negated offsets, so the tensor infos are patched temporarily.
+    // Copies of the original QuantizationInfo values are kept so they can be restored below,
+    // then the offsets of input and weights are negated while the scales are kept
+    const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+    const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+    input->info()->set_quantization_info(
+        QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+    weights->info()->set_quantization_info(
+        QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+    // Configure gemmlowp function while the negated offsets are in place
+    _mm_gemmlowp.configure(input, weights, nullptr, output);
+
+    // Restore the original QuantizationInfo, as input and weights could be used in other fully
+    // connected layers
+    input->info()->set_quantization_info(input_quantization_info);
+    weights->info()->set_quantization_info(weights_quantization_info);
+  }
+  else
+  {
+    // Configure the floating-point matrix multiply function
+    _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f,
+                       GEMMInfo(false, false, false /* Reshape weights only for the first run */));
+  }
+}
+
+void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights,
+                                                ITensor *output)
+{ // Conv->FC path: weights dimension 1 must equal the product of input dimensions 0..2
+  ARM_COMPUTE_ERROR_ON(
+      (weights->info()->dimension(1) !=
+       (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+  // The input comes from a convolution layer, so it is flattened into _flatten_output before
+  // the matrix multiply
+
+  // Describe the flatten output: the input's info with the flattened shape, resizable, no padding
+  TensorShape shape_flatten = compute_flatten_shape(input->info());
+  _flatten_output.allocator()->init(
+      input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          shape_flatten));
+
+  // Register the intermediate tensor with the memory group, then configure the flatten kernel
+  _memory_group.manage(&_flatten_output);
+  _flatten_kernel.configure(input, &_flatten_output);
+
+  // The matrix multiply consumes the flattened input
+  configure_mm(&_flatten_output, weights, output);
+
+  // Allocate the flatten output only after every function using it has been configured
+  _flatten_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights,
+                                              ITensor *output)
+{ // FC->FC path: input is used as-is; its dimension 0 must equal weights dimension 1
+  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+  // No flattening needed: configure the matrix multiply directly on the input
+  configure_mm(input, weights, output);
+}
+
+void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights,
+                                        const ITensor *biases, ITensor *output,
+                                        FullyConnectedLayerInfo fc_info)
+{ // Sets up flatten, weight reshape/convert, matrix multiply and bias/output stages for run()
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  // Reject unsupported configurations up front; biases is optional and may be nullptr
+  ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
+      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+      fc_info));
+
+  _are_weights_converted = true;
+  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  _is_fc_after_conv = true;
+  _accumulate_biases = false;
+  _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+  _original_weights = weights;
+
+  // Quantized path: the GEMM writes into an intermediate S32 tensor described like the output
+  if (_is_quantized)
+  {
+    _gemmlowp_output.allocator()->init(
+        output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+            DataType::S32));
+  }
+
+  // Non-quantized path: biases are added by a separate accumulate kernel (see run())
+  if (biases != nullptr && !_is_quantized)
+  {
+    _accumulate_biases = true;
+
+    // The accumulate biases kernel is configured on the final output tensor
+    _accumulate_biases_kernel.configure(output, biases);
+  }
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensor *weights_to_use = weights;
+
+  // Batched if output dim 1 > 1; then conv->FC requires input dims 3.. to match output dims 1..
+  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+  if (is_batched_fc_layer)
+  {
+    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                    input->info()->tensor_shape().cend(),
+                                    output->info()->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    _is_fc_after_conv = input->info()->num_dimensions() > 1;
+  }
+
+  // Reshape weights unless they are already flagged as reshaped
+  if (!_are_weights_reshaped)
+  {
+    // The result goes to _reshape_weights_output, which is allocated on the first run() call
+    _reshape_weights_function.configure(weights, &_reshape_weights_output);
+    weights_to_use = &_reshape_weights_output;
+  }
+
+  // Conv->FC with an input data layout differing from the trained layout needs a conversion
+  if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Chained after the optional reshape; the result is allocated on the first run() call
+    _convert_weights.configure(weights_to_use, &_converted_weights_output,
+                               input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+    weights_to_use = &_converted_weights_output;
+    _are_weights_converted = false;
+  }
+
+  ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
+  if (_is_fc_after_conv)
+  {
+    // Input comes from a convolution layer: flatten it before the matrix multiply
+    configure_conv_fc(input, weights_to_use, tmp_output);
+  }
+  else
+  {
+    // Input comes from a fully connected layer: feed it to the matrix multiply directly
+    configure_fc_fc(input, weights_to_use, tmp_output);
+  }
+
+  // Quantized path: requantize the S32 result using input_scale * weights_scale / output_scale
+  if (_is_quantized)
+  {
+    float multiplier = input->info()->quantization_info().scale *
+                       weights->info()->quantization_info().scale /
+                       output->info()->quantization_info().scale;
+    int output_multiplier;
+    int output_shift;
+    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier,
+                                                               &output_shift);
+    _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier,
+                                     output_shift, output->info()->quantization_info().offset);
+    _gemmlowp_output.allocator()->allocate();
+  }
+
+  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                         const ITensorInfo *biases, const ITensorInfo *output,
+                                         FullyConnectedLayerInfo fc_info)
+{ // Repeats the path selection of configure() on tensor infos only and reports a Status
+  ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  bool is_fc_after_conv = true;
+  bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+  const ITensorInfo &flatten_input =
+      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_flatten_shape(input)));
+  const ITensorInfo &reshaped_weights =
+      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+  const ITensorInfo &converted_weights =
+      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                       : TensorInfo(*reshaped_weights.clone());
+  const ITensorInfo &gemmlowp_output = TensorInfo(
+      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+  // Validate accumulate biases kernel for non quantized asymmetric types
+  if (biases != nullptr && !is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+  }
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensorInfo *input_to_use = input;
+  const ITensorInfo *weights_to_use = weights;
+  const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+  // Batched if output dim 1 > 1; then conv->FC requires input dims 3.. to match output dims 1..
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+  if (is_batched_fc_layer)
+  {
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->num_dimensions() > 1;
+  }
+
+  if (!weights_reshaped)
+  {
+    // Validate the weights reshape against the transposed-shape info built above
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Validate the weights conversion, chained after the optional reshape
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+    weights_to_use = &converted_weights;
+  }
+
+  if (is_fc_after_conv)
+  {
+    // Conv->FC: weights dimension 1 must equal the product of input dimensions 0..2
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        (weights_to_use->dimension(1) !=
+         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+    // Validate flatten kernel; the matrix multiply is then checked on the flattened info
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+    input_to_use = &flatten_input;
+  }
+  else
+  {
+    // FC->FC: input dimension 0 must equal weights dimension 1
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+  }
+  // Validate the matrix multiply on the selected input, weights and output infos
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+  // Validate output stage for asymmetric quantized types
+  if (is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+        &gemmlowp_output, biases, output));
+  }
+
+  return Status{};
+}
+
+void NEFullyConnectedLayerEx::run()
+{ // Executes the configured layer; weight reshape/convert and GEMM prepare are issued each call
+  if (!_is_prepared) // first call only: allocate the intermediate weight tensors that are needed
+  {
+    if (!_are_weights_reshaped)
+      _reshape_weights_output.allocator()->allocate();
+    if (!_are_weights_converted)
+      _converted_weights_output.allocator()->allocate();
+    _is_prepared = true;
+  }
+
+  { // This block is not guarded by _is_prepared, so it runs on every call
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Reshape the weights if configure() set up the reshape function
+    if (!_are_weights_reshaped)
+    {
+      _reshape_weights_function.run();
+    }
+
+    // Convert weights if needed
+    if (!_are_weights_converted)
+    {
+      _convert_weights.run();
+    }
+
+    // Run the GEMM prepare step (non-quantized path only)
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+  }
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Linearize input if it comes from a convolutional layer
+  if (_is_fc_after_conv)
+  {
+    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+  }
+
+  // Run matrix multiply
+  if (_is_quantized)
+  {
+    _mm_gemmlowp.run();
+  }
+  else
+  {
+    _mm_gemm.run();
+  }
+
+  // Quantized: run the gemmlowp output stage; otherwise accumulate biases if provided
+  if (_is_quantized)
+  {
+    _gemmlowp_output_stage.run();
+  }
+  else
+  {
+    if (_accumulate_biases)
+    {
+      NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+    }
+  }
+}
+
+void NEFullyConnectedLayerEx::prepare()
+{ // Currently a no-op: the one-time weight preparation below is compiled out by the #if 0
+#if 0 // TODO Remove this block
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    auto release_unused = [](Tensor *w) {
+      if (!w->is_used())
+      {
+        w->allocator()->free();
+      }
+    };
+
+    // Pointer to current weights
+    const ITensor *cur_weights = _original_weights;
+
+    // Reshape of the weights (happens only once)
+    if (!_are_weights_reshaped)
+    {
+      // Run reshape weights kernel and mark weights as unused
+      _reshape_weights_output.allocator()->allocate();
+      _reshape_weights_function.run();
+
+      cur_weights->mark_as_unused();
+      cur_weights = &_reshape_weights_output;
+      _are_weights_reshaped = true;
+    }
+
+    // Convert weights if needed (happens only once)
+    if (!_are_weights_converted)
+    {
+      _converted_weights_output.allocator()->allocate();
+      _convert_weights.run();
+
+      cur_weights->mark_as_unused();
+      _are_weights_converted = true;
+    }
+
+    // Release reshaped weights if unused
+    release_unused(&_reshape_weights_output);
+
+    // Run the GEMM prepare step (non-quantized path only)
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+
+    // Release reshaped (checked a second time) and converted weights if unused
+    release_unused(&_reshape_weights_output);
+    release_unused(&_converted_weights_output);
+
+    _is_prepared = true;
+  }
+#endif
+}