1 files changed, 92 insertions, 64 deletions
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
index 05da33abf..32cad84cb 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
@@ -31,9 +31,9 @@ namespace ops
 {
 
 FullyConnectedLayer::FullyConnectedLayer()
-    : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
-      _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),
-      _external_context(nullptr), _is_hybrid(false)
+  : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
+    _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),
+    _external_context(nullptr), _is_hybrid(false), _is_shuffled16x1float32(false)
 {
   // DO NOTHING
 }
@@ -42,19 +42,22 @@ FullyConnectedLayer::~FullyConnectedLayer() = default;
 
 void FullyConnectedLayer::fullyConnectedFloat32()
 {
-  float output_activation_min = 0, output_activation_max = 0;
+  nnfw::cker::FullyConnectedParams op_params;
+  float output_activation_min = 0;
+  float output_activation_max = 0;
   CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);
 
-  nnfw::cker::FullyConnectedParams op_params;
+  op_params.activation = convertActivationType(_activation);
   op_params.float_activation_min = output_activation_min;
   op_params.float_activation_max = output_activation_max;
-  op_params.activation = convertActivationType(_activation);
-
-  nnfw::cker::FullyConnected(
-      op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
-      getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
-      getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
-      getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+  // TODO Set both cacheable flags to false when training
+  op_params.lhs_cacheable = _weights->is_constant();
+  op_params.rhs_cacheable = _input->is_constant();
+
+  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input),
+                             getShape(_weights), getBuffer<float>(_weights), getShape(_bias),
+                             _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
+                             getBuffer<float>(_output));
 }
 
 // executionMutex is used to protect concurrent access of non-threadsafe resources
@@ -68,23 +71,22 @@ void FullyConnectedLayer::fullyConnectedQuant8()
   int32_t output_activation_max = 0;
   GetQuantizedConvolutionMultiplier(_input, _weights, _bias, _output, &real_multiplier);
   QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
-  CalculateActivationRangeUint8(_activation, _output, &output_activation_min,
-                                &output_activation_max);
+  CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
+                                    &output_activation_max);
 
   nnfw::cker::FullyConnectedParams op_params;
-  op_params.input_offset = -_input->data_offset();
-  op_params.weights_offset = -_weights->data_offset();
-  op_params.output_offset = _output->data_offset();
+  op_params.input_offset = -_input->data_zero_point();
+  op_params.weights_offset = -_weights->data_zero_point();
+  op_params.output_offset = _output->data_zero_point();
   op_params.output_multiplier = output_multiplier;
   op_params.output_shift = output_shift;
   op_params.quantized_activation_min = output_activation_min;
   op_params.quantized_activation_max = output_activation_max;
 
-  nnfw::cker::FullyConnected(
-      op_params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
-      getTensorShape(_weights), reinterpret_cast<const uint8_t *>(_weights->buffer()),
-      getTensorShape(_bias), reinterpret_cast<const int32_t *>(_bias ? _bias->buffer() : nullptr),
-      getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input),
+                             getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias),
+                             _bias ? getBuffer<int32_t>(_bias) : nullptr, getShape(_output),
+                             getBuffer<uint8_t>(_output));
 }
 
 void FullyConnectedLayer::fullyConnectedHybrid()
@@ -92,7 +94,7 @@ void FullyConnectedLayer::fullyConnectedHybrid()
   nnfw::cker::FCTempArena &temp_arena = *_temp_arena;
   if (!temp_arena.prepared)
   {
-    temp_arena.prepare(getTensorShape(_input), getTensorShape(_weights));
+    temp_arena.prepare(getShape(_input), getShape(_weights));
   }
 
   nnfw::cker::FullyConnectedParams op_params;
@@ -101,20 +103,16 @@ void FullyConnectedLayer::fullyConnectedHybrid()
 
 #ifndef USE_RUY_GEMV
   nnfw::cker::FullyConnectedHybrid(
-      op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
-      getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()),
-      getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
-      getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena,
-      _external_context->ruy_context());
+    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+    getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+    getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
 #else
   nnfw::cker::FullyConnectedHybrid(
-      op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
-      getTensorShape(_weights),
-      (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
-                        : reinterpret_cast<const int8_t *>(_weights->buffer()),
-      getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
-      getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena,
-      _external_context->ruy_context());
+    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+    (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
+                      : getBuffer<int8_t>(_weights),
+    getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
+    getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
 
   if (_cached_weights == nullptr || _is_weights_freed)
     return;
@@ -125,8 +123,8 @@ void FullyConnectedLayer::fullyConnectedHybrid()
 
   // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path)
   // so that handle this case
-  const int input_size = getTensorShape(_input).FlatSize();
-  if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size))
+  const int input_size = getShape(_input).FlatSize();
+  if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size))
     return;
 
   auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);
@@ -143,6 +141,10 @@ void FullyConnectedLayer::fullyConnectedHybrid()
   tensor->decrease_ref();
   if (tensor->buffer() == nullptr) // ref == 0?
   {
+#if defined(__ANDROID__) && (__ANDROID_API__ >= 26)
+    // NOTE This line forces the OS to release any unused memory immediately
+    mallopt(M_PURGE, 0);
+#endif
     _is_weights_freed = true;
   }
 #endif
@@ -150,28 +152,52 @@ void FullyConnectedLayer::fullyConnectedHybrid()
 
 void FullyConnectedLayer::fullyConnectedSparseWeight()
 {
+  nnfw::cker::FullyConnectedParams op_params;
+  op_params.activation = convertActivationType(_activation);
+
+  const uint16_t *w1_segments = _weights->sparsity()->w1_segments();
+  const uint16_t *w1_indices = _weights->sparsity()->w1_indices();
+
+  auto block_size = _weights->sparsity()->block_size();
+  if (block_size.size() == 0)
+  {
+    nnfw::cker::FullyConnectedSparseWeightRandom(
+      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
+  }
+  else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
+  {
+    nnfw::cker::FullyConnectedSparseWeight16x1(
+      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
+  }
+  else
+    throw std::runtime_error{"FullyConnected: unsupported sparsity"};
+}
+
+void FullyConnectedLayer::fullyConnected16x1Float32()
+{
+#if defined(__aarch64__) && defined(USE_NEON)
   float output_activation_min = 0, output_activation_max = 0;
   CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);
 
   nnfw::cker::FullyConnectedParams op_params;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
   op_params.activation = convertActivationType(_activation);
 
-  int w0_size = getTensorShape(_weights).Dims(0);
-  const uint16_t *w1_segments = _weights->w1_segments();
-  const uint16_t *w1_indices = _weights->w1_indices();
-
-  nnfw::cker::FullyConnectedSparseWeight(
-      op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
-      getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
-      getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
-      getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments,
-      w1_indices);
+  nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input),
+                                        getShape(_weights), getBuffer<float>(_weights),
+                                        getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+                                        getShape(_output), getBuffer<float>(_output));
+#else
+  throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
+#endif
 }
 
 void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
                                     const IPortableTensor *bias, ir::Activation activation,
+                                    ir::FullyConnectedWeightsFormat weights_format,
                                     IPortableTensor *output,
                                     const std::shared_ptr<ExternalContext> &external_context)
 {
@@ -182,6 +208,14 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
   _output = output;
   _is_hybrid = input->data_type() == OperandType::FLOAT32 &&
                weights->data_type() == OperandType::QUANT_INT8_SYMM;
+  _is_shuffled16x1float32 = weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32;
+#if !defined(__aarch64__) || !defined(USE_NEON)
+  if (_is_shuffled16x1float32)
+  {
+    throw std::runtime_error{
+      "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
+  }
+#endif
   _external_context = external_context;
 }
 
@@ -191,13 +225,13 @@ void FullyConnectedLayer::run()
   {
     fullyConnectedHybrid();
   }
-  else if (_weights->is_sparse())
+  else if (_weights->sparsity())
   {
     fullyConnectedSparseWeight();
   }
   else if (_input->data_type() == OperandType::FLOAT32)
   {
-    fullyConnectedFloat32();
+    _is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();
   }
   else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
   {
@@ -213,8 +247,8 @@ void FullyConnectedLayer::prepare()
 {
   if (_bias && _bias->is_constant())
   {
-    const int bias_size = getTensorShape(_bias).FlatSize();
-    if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size))
+    const int bias_size = getShape(_bias).FlatSize();
+    if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size))
     {
       _bias = nullptr;
     }
@@ -236,20 +270,14 @@ void FullyConnectedLayer::prepare()
   if (_input->is_dynamic() || !_weights->is_constant())
     return;
 
-  const int rows = getTensorShape(_weights).Dims(0);
+  const int rows = getShape(_weights).Dims(0);
   if (rows % 4 == 0)
   {
-    const int total_input_size = getTensorShape(_input).FlatSize();
-    const int input_size = getTensorShape(_weights).Dims(1);
-    const int batch_size = total_input_size / input_size;
-    if (batch_size <= 4)
-    {
-      // TODO If it's possible to extract precaching from ruy kernel,
-      // place this instead of below code
+    // TODO If precaching can be extracted from the ruy kernel,
+    // do it here in place of the code below
 
-      // buffer will be used by ruy kernel as a cache key
-      _cached_weights = _weights->buffer();
-    }
+    // The weights buffer will be used by the ruy kernel as a cache key
+    _cached_weights = _weights->buffer();
   }
 #endif
 }