diff options
Diffstat (limited to 'runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc')
-rw-r--r-- | runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc | 156 |
1 files changed, 92 insertions, 64 deletions
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc index 05da33abf..32cad84cb 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc @@ -31,9 +31,9 @@ namespace ops { FullyConnectedLayer::FullyConnectedLayer() - : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), - _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()), - _external_context(nullptr), _is_hybrid(false) + : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), + _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()), + _external_context(nullptr), _is_hybrid(false), _is_shuffled16x1float32(false) { // DO NOTHING } @@ -42,19 +42,22 @@ FullyConnectedLayer::~FullyConnectedLayer() = default; void FullyConnectedLayer::fullyConnectedFloat32() { - float output_activation_min = 0, output_activation_max = 0; + nnfw::cker::FullyConnectedParams op_params; + float output_activation_min = 0; + float output_activation_max = 0; CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); - nnfw::cker::FullyConnectedParams op_params; + op_params.activation = convertActivationType(_activation); op_params.float_activation_min = output_activation_min; op_params.float_activation_max = output_activation_max; - op_params.activation = convertActivationType(_activation); - - nnfw::cker::FullyConnected( - op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), - getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), - getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer())); + // TODO Set both cachables as false when training + op_params.lhs_cacheable = _weights->is_constant(); + op_params.rhs_cacheable = _input->is_constant(); + + nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input), + getShape(_weights), getBuffer<float>(_weights), getShape(_bias), + _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output), + getBuffer<float>(_output)); } // executionMutex is used to protect concurrent access of non-threadsafe resources @@ -68,23 +71,22 @@ void FullyConnectedLayer::fullyConnectedQuant8() int32_t output_activation_max = 0; GetQuantizedConvolutionMultiplier(_input, _weights, _bias, _output, &real_multiplier); QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift); - CalculateActivationRangeUint8(_activation, _output, &output_activation_min, - &output_activation_max); + CalculateActivationRangeQuantized(_activation, _output, &output_activation_min, + &output_activation_max); nnfw::cker::FullyConnectedParams op_params; - op_params.input_offset = -_input->data_offset(); - op_params.weights_offset = -_weights->data_offset(); - op_params.output_offset = _output->data_offset(); + op_params.input_offset = -_input->data_zero_point(); + op_params.weights_offset = -_weights->data_zero_point(); + op_params.output_offset = _output->data_zero_point(); op_params.output_multiplier = output_multiplier; op_params.output_shift = output_shift; op_params.quantized_activation_min = output_activation_min; op_params.quantized_activation_max = output_activation_max; - nnfw::cker::FullyConnected( - op_params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()), - getTensorShape(_weights), reinterpret_cast<const uint8_t *>(_weights->buffer()), - getTensorShape(_bias), reinterpret_cast<const int32_t *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer())); + nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input), + getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias), + _bias ? getBuffer<int32_t>(_bias) : nullptr, getShape(_output), + getBuffer<uint8_t>(_output)); } void FullyConnectedLayer::fullyConnectedHybrid() @@ -92,7 +94,7 @@ void FullyConnectedLayer::fullyConnectedHybrid() nnfw::cker::FCTempArena &temp_arena = *_temp_arena; if (!temp_arena.prepared) { - temp_arena.prepare(getTensorShape(_input), getTensorShape(_weights)); + temp_arena.prepare(getShape(_input), getShape(_weights)); } nnfw::cker::FullyConnectedParams op_params; @@ -101,20 +103,16 @@ void FullyConnectedLayer::fullyConnectedHybrid() #ifndef USE_RUY_GEMV nnfw::cker::FullyConnectedHybrid( - op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), - getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()), - getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena, - _external_context->ruy_context()); + op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights), + getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, + getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context()); #else nnfw::cker::FullyConnectedHybrid( - op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), - getTensorShape(_weights), - (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights) - : reinterpret_cast<const int8_t *>(_weights->buffer()), - getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena, - _external_context->ruy_context()); + op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights), + (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights) + : getBuffer<int8_t>(_weights), + getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output), + getBuffer<float>(_output), temp_arena, _external_context->ruy_context()); if (_cached_weights == nullptr || _is_weights_freed) return; @@ -125,8 +123,8 @@ void FullyConnectedLayer::fullyConnectedHybrid() // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path) // so that handle this case - const int input_size = getTensorShape(_input).FlatSize(); - if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size)) + const int input_size = getShape(_input).FlatSize(); + if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size)) return; auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights); @@ -143,6 +141,10 @@ void FullyConnectedLayer::fullyConnectedHybrid() tensor->decrease_ref(); if (tensor->buffer() == nullptr) // ref == 0? { +#if defined(__ANDROID__) && (__ANDROID_API__ >= 26) + // NOTE This line forces OS to release any unused memory immediately + mallopt(M_PURGE, 0); +#endif _is_weights_freed = true; } #endif @@ -150,28 +152,52 @@ void FullyConnectedLayer::fullyConnectedHybrid() void FullyConnectedLayer::fullyConnectedSparseWeight() { + nnfw::cker::FullyConnectedParams op_params; + op_params.activation = convertActivationType(_activation); + + const uint16_t *w1_segments = _weights->sparsity()->w1_segments(); + const uint16_t *w1_indices = _weights->sparsity()->w1_indices(); + + auto block_size = _weights->sparsity()->block_size(); + if (block_size.size() == 0) + { + nnfw::cker::FullyConnectedSparseWeightRandom( + op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights), + getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, + getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices); + } + else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1) + { + nnfw::cker::FullyConnectedSparseWeight16x1( + op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights), + getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, + getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices); + } + else + throw std::runtime_error{"FullyConnected: unsupported sparsity"}; +} + +void FullyConnectedLayer::fullyConnected16x1Float32() +{ +#if defined(__aarch64__) && defined(USE_NEON) float output_activation_min = 0, output_activation_max = 0; CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); nnfw::cker::FullyConnectedParams op_params; - op_params.float_activation_min = output_activation_min; - op_params.float_activation_max = output_activation_max; op_params.activation = convertActivationType(_activation); - int w0_size = getTensorShape(_weights).Dims(0); - const uint16_t *w1_segments = _weights->w1_segments(); - const uint16_t *w1_indices = _weights->w1_indices(); - - nnfw::cker::FullyConnectedSparseWeight( - op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), - getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), - getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments, - w1_indices); + nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input), + getShape(_weights), getBuffer<float>(_weights), + getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, + getShape(_output), getBuffer<float>(_output)); +#else + throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."}; +#endif } void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, const IPortableTensor *bias, ir::Activation activation, + ir::FullyConnectedWeightsFormat weights_format, IPortableTensor *output, const std::shared_ptr<ExternalContext> &external_context) { @@ -182,6 +208,14 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl _output = output; _is_hybrid = input->data_type() == OperandType::FLOAT32 && weights->data_type() == OperandType::QUANT_INT8_SYMM; + _is_shuffled16x1float32 = weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32; +#if !defined(__aarch64__) || !defined(USE_NEON) + if (_is_shuffled16x1float32) + { + throw std::runtime_error{ + "FullyConnected: Shuffled16x1Float32 weights_format is not supported."}; + } +#endif _external_context = external_context; } @@ -191,13 +225,13 @@ void FullyConnectedLayer::run() { fullyConnectedHybrid(); } - else if (_weights->is_sparse()) + else if (_weights->sparsity()) { fullyConnectedSparseWeight(); } else if (_input->data_type() == OperandType::FLOAT32) { - fullyConnectedFloat32(); + _is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32(); } else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { @@ -213,8 +247,8 @@ void FullyConnectedLayer::prepare() { if (_bias && _bias->is_constant()) { - const int bias_size = getTensorShape(_bias).FlatSize(); - if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size)) + const int bias_size = getShape(_bias).FlatSize(); + if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size)) { _bias = nullptr; } @@ -236,20 +270,14 @@ void FullyConnectedLayer::prepare() if (_input->is_dynamic() || !_weights->is_constant()) return; - const int rows = getTensorShape(_weights).Dims(0); + const int rows = getShape(_weights).Dims(0); if (rows % 4 == 0) { - const int total_input_size = getTensorShape(_input).FlatSize(); - const int input_size = getTensorShape(_weights).Dims(1); - const int batch_size = total_input_size / input_size; - if (batch_size <= 4) - { - // TODO If it's possible to extract precaching from ruy kernel, - // place this instead of below code + // TODO If it's possible to extract precaching from ruy kernel, + // place this instead of below code - // buffer will be used by ruy kernel as a cache key - _cached_weights = _weights->buffer(); - } + // buffer will be used by ruy kernel as a cache key + _cached_weights = _weights->buffer(); } #endif } |