diff options
Diffstat (limited to 'compute/cker/include/cker/operation/FullyConnected.h')
-rw-r--r-- | compute/cker/include/cker/operation/FullyConnected.h | 138 |
1 files changed, 90 insertions, 48 deletions
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 428fb1b53..01b925efb 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -19,69 +19,66 @@ #define __NNFW_CKER_FULLY_CONNECTED_H__ #include "cker/Shape.h" +#include "cker/Types.h" #include "cker/Utils.h" +#include "cker/TensorUtils.h" namespace nnfw { namespace cker { -struct FullyConnectedParams +class FCTempArena { - // uint8 inference params. - // TODO(b/65838351): Use smaller types if appropriate. - int32_t input_offset; - int32_t weights_offset; - int32_t output_offset; - int32_t output_multiplier; - int output_shift; - // uint8, etc, activation params. - int32_t quantized_activation_min; - int32_t quantized_activation_max; - // float activation params. - float float_activation_min; - float float_activation_max; - // FullyConnectedWeightsFormat weights_format; +public: + FCTempArena(void) : prepared(false), input_quantized(), scaling_factors() + { + // DO NOTHING + } + + void prepare(const Shape &input_shape, const Shape &weights_shape) + { + auto input_size = input_shape.FlatSize(); + input_quantized.resize(input_size); + + assert(weights_shape.DimensionsCount() == 2); + int batch_size = input_size / weights_shape.Dims(1); + scaling_factors.resize(batch_size); + prepared = true; + } + +public: + bool prepared; + std::vector<int8_t> input_quantized; + std::vector<float> scaling_factors; }; inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, - const float *weights_data, const Shape &bias_shape, - const float *bias_data, const Shape &output_shape, float *output_data) + const float *weights_data, const Shape &, const float *bias_data, + const Shape &, float *output_data) { - UNUSED_RELEASE(input_shape); - UNUSED_RELEASE(bias_shape); - const float output_activation_min = 
params.float_activation_min; - const float output_activation_max = params.float_activation_max; - // TODO(benoitjacob): This really should be: - // const int batches = ArraySize(output_dims, 1); - // but the current --variable_batch hack consists in overwriting the 3rd - // dimension with the runtime batch size, as we don't keep track for each - // array of which dimension is the batch dimension in it. - const int output_dims_count = output_shape.DimensionsCount(); - const int weights_dims_count = weights_shape.DimensionsCount(); - const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); - const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); - const int accum_depth = weights_shape.Dims(weights_dims_count - 1); - for (int b = 0; b < batches; ++b) + int total_input_size = input_shape.FlatSize(); + int input_size = weights_shape.Dims(1); + const int batch_size = total_input_size / input_size; + const int num_units = weights_shape.Dims(0); + + // Output = bias if bias tensor exists. 
+ if (bias_data) { - for (int out_c = 0; out_c < output_depth; ++out_c) - { - float total = 0.f; - for (int d = 0; d < accum_depth; ++d) - { - total += input_data[b * accum_depth + d] * weights_data[out_c * accum_depth + d]; - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[out_c]; - } - output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax( - total + bias_value, output_activation_min, output_activation_max); - } + VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data); + } + else + { + ZeroVector(output_data, batch_size * num_units); } + + // Compute output += weight * input + MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size, + output_data, /*result_stride=*/1); + + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); } inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape, @@ -138,6 +135,51 @@ inline void FullyConnected(const FullyConnectedParams &params, const Shape &inpu } } +inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape, + const float *input_data, const Shape &filter_shape, + const int8_t *filter_data, const Shape &, const float *bias_data, + const Shape &, float *output_data, FCTempArena &temp_arena) +{ + int total_input_size = input_shape.FlatSize(); + const int input_size = filter_shape.Dims(1); + const int batch_size = total_input_size / input_size; + const int num_units = filter_shape.Dims(0); + + // Output = bias if bias tensor exists. + VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data); + + // Save matrix multiplication computation for all zero input. 
+ if (IsZeroVector(input_data, total_input_size)) + { + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + return; + } + + // Quantize input from float to uint8 + quantization params (scaling factor). + float unused_min, unused_max; + float *scaling_factors_ptr = temp_arena.scaling_factors.data(); + int8_t *quant_data = temp_arena.input_quantized.data(); + + // Quantize each batch independently. + for (int b = 0; b < batch_size; ++b) + { + const int offset = b * input_size; + SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min, + &unused_max, &scaling_factors_ptr[b]); + // Incorporate scaling of the filter. + scaling_factors_ptr[b] *= params.weights_scale; + } + + // Compute output += weight * quantized_input + MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data, + scaling_factors_ptr, batch_size, output_data, + /*result_stride=*/1); + + // Apply activation function to floats. + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + return; +} + } // namespace cker } // namespace nnfw |