diff options
Diffstat (limited to 'runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc')
-rw-r--r-- | runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc | 85 |
1 file changed, 69 insertions, 16 deletions
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc index c00be64e5..05da33abf 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc @@ -18,6 +18,8 @@ #include "../Tensor.h" #include <cker/operation/FullyConnected.h> +#include <cker/TensorUtils.h> +#include <misc/polymorphic_downcast.h> namespace onert { @@ -31,7 +33,7 @@ namespace ops FullyConnectedLayer::FullyConnectedLayer() : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()), - _is_hybrid(false) + _external_context(nullptr), _is_hybrid(false) { // DO NOTHING } @@ -102,7 +104,8 @@ void FullyConnectedLayer::fullyConnectedHybrid() op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()), getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena); + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena, + _external_context->ruy_context()); #else nnfw::cker::FullyConnectedHybrid( op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), @@ -110,31 +113,67 @@ void FullyConnectedLayer::fullyConnectedHybrid() (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights) : reinterpret_cast<const int8_t *>(_weights->buffer()), getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? 
_bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena); + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena, + _external_context->ruy_context()); -// TODO Enable calling decrease_ref -#if 0 if (_cached_weights == nullptr || _is_weights_freed) return; - auto weight_tensor = dynamic_cast<const Tensor *>(_weights); - if (weight_tensor) + // '_cached_weights is not nullptr and _is_weights_freed is false' means + // this weight shape is satisfied with the ruy kernel's prepack cache's condition. + // After entering here, it will not enter again except below the case - input is zero-vector + + // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path) + // so that handle this case + const int input_size = getTensorShape(_input).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size)) + return; + + auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights); + + // This weight tensor could be other ops' const tensor. + // Therefore, below reference should be checked like following + auto tensor = const_cast<Tensor *>(weight_tensor); + if (tensor->buffer() == nullptr) // ref is already 0? { - auto tensor = const_cast<Tensor *>(weight_tensor); + _is_weights_freed = true; + return; + } - tensor->decrease_ref(); - if (tensor->buffer() == nullptr) // ref == 0? - { - _is_weights_freed = true; - } + tensor->decrease_ref(); + if (tensor->buffer() == nullptr) // ref == 0? 
+ { + _is_weights_freed = true; } -#endif // if 0 #endif } +void FullyConnectedLayer::fullyConnectedSparseWeight() +{ + float output_activation_min = 0, output_activation_max = 0; + CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); + + nnfw::cker::FullyConnectedParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + op_params.activation = convertActivationType(_activation); + + int w0_size = getTensorShape(_weights).Dims(0); + const uint16_t *w1_segments = _weights->w1_segments(); + const uint16_t *w1_indices = _weights->w1_indices(); + + nnfw::cker::FullyConnectedSparseWeight( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), + getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments, + w1_indices); +} + void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, const IPortableTensor *bias, ir::Activation activation, - IPortableTensor *output) + IPortableTensor *output, + const std::shared_ptr<ExternalContext> &external_context) { _input = input; _weights = weights; @@ -143,6 +182,7 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl _output = output; _is_hybrid = input->data_type() == OperandType::FLOAT32 && weights->data_type() == OperandType::QUANT_INT8_SYMM; + _external_context = external_context; } void FullyConnectedLayer::run() @@ -151,6 +191,10 @@ void FullyConnectedLayer::run() { fullyConnectedHybrid(); } + else if (_weights->is_sparse()) + { + fullyConnectedSparseWeight(); + } else if (_input->data_type() == OperandType::FLOAT32) { fullyConnectedFloat32(); @@ -167,7 +211,16 @@ void FullyConnectedLayer::run() void 
FullyConnectedLayer::prepare() { -#ifdef USE_RUY_GEMV + if (_bias && _bias->is_constant()) + { + const int bias_size = getTensorShape(_bias).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size)) + { + _bias = nullptr; + } + } + +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV) // TODO This is workaround // The only fc hybrid will use ruy kernel if (_input->data_type() != OperandType::FLOAT32 || |