diff options
Diffstat (limited to 'compiler/luci-interpreter/src/kernels/Mean.cpp')
-rw-r--r-- | compiler/luci-interpreter/src/kernels/Mean.cpp | 175 |
1 files changed, 136 insertions, 39 deletions
diff --git a/compiler/luci-interpreter/src/kernels/Mean.cpp b/compiler/luci-interpreter/src/kernels/Mean.cpp index 2394e2c0e..8e65e0d6d 100644 --- a/compiler/luci-interpreter/src/kernels/Mean.cpp +++ b/compiler/luci-interpreter/src/kernels/Mean.cpp @@ -19,7 +19,7 @@ #include "kernels/Utils.h" -#include <tensorflow/lite/kernels/internal/reference/reference_ops.h> +#include <tensorflow/lite/kernels/internal/reference/reduce.h> #include <stdexcept> @@ -28,7 +28,7 @@ namespace luci_interpreter namespace kernels { -static void resolveAxes(const int *axes_data, int num_axes, tflite::MeanParams *params) +static void resolveAxes(const int32_t *axes_data, int num_axes, tflite::MeanParams *params) { params->axis_count = num_axes; for (int i = 0; i < num_axes; ++i) @@ -42,7 +42,7 @@ static void resolveAxes(const int *axes_data, int num_axes, tflite::MeanParams * } // Returns the number of axes that will be reduced. Removes duplicates. -static int getAxisReductionCount(const int *axes_data, int num_axes, int input_num_dims) +static int getAxisReductionCount(const int32_t *axes_data, int num_axes, int input_num_dims) { int reduction_count = num_axes; for (int i = 0; i < num_axes; ++i) @@ -63,7 +63,7 @@ static int getAxisReductionCount(const int *axes_data, int num_axes, int input_n return reduction_count; } -static Shape getOutputShape(const Shape &input_shape, const int *axes_data, int num_axes, +static Shape getOutputShape(const Shape &input_shape, const int32_t *axes_data, int num_axes, bool keep_dims) { int input_num_dims = input_shape.num_dims(); @@ -123,15 +123,22 @@ static Shape getOutputShape(const Shape &input_shape, const int *axes_data, int } } -Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, const ReducerParams &params) - : KernelWithParams<ReducerParams>({input, axes}, {output}, params) +Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index, + Tensor *resolved_axes, Tensor *temp_sum, const ReducerParams &params) + : 
KernelWithParams<ReducerParams>({input, axes}, {output, temp_index, resolved_axes, temp_sum}, + params) { } void Mean::configure() { - assert(input()->element_type() == output()->element_type()); - assert(axes()->element_type() == DataType::S32); + LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type()); + LUCI_INTERPRETER_CHECK(axes()->element_type() == DataType::S32); + if (input()->element_type() == DataType::S16) + { + LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0); + } + const Shape &input_shape = input()->shape(); int input_num_dims = input_shape.num_dims(); @@ -144,18 +151,28 @@ void Mean::configure() tflite::MeanParams params{}; resolveAxes(axes_data, num_axes, &params); - const bool need_temporaries = - !(_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 && - ((params.axis[0] == 1 && params.axis[1] == 2) || - (params.axis[0] == 2 && params.axis[1] == 1))); - if (need_temporaries) - { - _temp_index = - std::make_unique<Tensor>(DataType::S32, Shape(input_num_dims), AffineQuantization{}, ""); - _resolved_axes = - std::make_unique<Tensor>(DataType::S32, Shape(num_axes), AffineQuantization{}, ""); - _temp_sum = std::make_unique<Tensor>(input()->element_type(), output()->shape(), - AffineQuantization{}, ""); + _need_temporaries = !( + _params.keep_dims && input_num_dims == 4 && params.axis_count == 2 && + ((params.axis[0] == 1 && params.axis[1] == 2) || (params.axis[0] == 2 && params.axis[1] == 1))); + if (_need_temporaries) + { + auto temp_index = getOutputTensors()[1]; + auto resolved_axes = getOutputTensors()[2]; + auto temp_sum = getOutputTensors()[3]; + + temp_index->resize(Shape(input_num_dims)); + resolved_axes->resize(Shape(num_axes)); + temp_sum->resize(output()->shape()); + } + else + { + auto temp_index = getOutputTensors()[1]; + auto resolved_axes = getOutputTensors()[2]; + auto temp_sum = getOutputTensors()[3]; + + temp_index->set_allocatable(false); + 
resolved_axes->set_allocatable(false); + temp_sum->set_allocatable(false); } } @@ -169,6 +186,9 @@ void Mean::execute() const case DataType::U8: evalQuantized(); break; + case DataType::S16: + evalQuantizedS16(); + break; default: throw std::runtime_error("Unsupported type."); } @@ -184,6 +204,10 @@ void Mean::evalFloat() const tflite::MeanParams params{}; resolveAxes(axes_data, num_axes, &params); + auto temp_index = getOutputTensors()[1]; + auto resolved_axes = getOutputTensors()[2]; + auto temp_sum = getOutputTensors()[3]; + // Defer to specialized implementation for 4D Mean across axes 1 & 2. if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 && ((params.axis[0] == 1 && params.axis[1] == 2) || @@ -194,12 +218,12 @@ void Mean::evalFloat() const } else { - tflite::reference_ops::Mean( - getTensorData<float>(input()), getTensorShape(input()).DimsData(), - input()->shape().num_dims(), getTensorData<float>(output()), - getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes, - _params.keep_dims, getTensorData<int>(_temp_index.get()), - getTensorData<int>(_resolved_axes.get()), getTensorData<float>(_temp_sum.get())); + tflite::reference_ops::Mean(getTensorData<float>(input()), getTensorShape(input()).DimsData(), + input()->shape().num_dims(), getTensorData<float>(output()), + getTensorShape(output()).DimsData(), output()->shape().num_dims(), + axes_data, num_axes, _params.keep_dims, + getTensorData<int>(temp_index), getTensorData<int>(resolved_axes), + getTensorData<float>(temp_sum)); } } @@ -213,6 +237,10 @@ void Mean::evalQuantized() const tflite::MeanParams params{}; resolveAxes(axes_data, num_axes, &params); + auto temp_index = getOutputTensors()[1]; + auto resolved_axes = getOutputTensors()[2]; + auto temp_sum = getOutputTensors()[3]; + // Defer to specialized implementation for 4D Mean across axes 1 & 2. 
if (_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 && ((params.axis[0] == 1 && params.axis[1] == 2) || @@ -225,23 +253,92 @@ void Mean::evalQuantized() const } else if (input()->zero_point() == output()->zero_point() && input()->scale() == output()->scale()) { - tflite::reference_ops::Mean( - getTensorData<uint8_t>(input()), getTensorShape(input()).DimsData(), - input()->shape().num_dims(), getTensorData<uint8_t>(output()), - getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes, - _params.keep_dims, getTensorData<int>(_temp_index.get()), - getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get())); + tflite::reference_ops::Mean(getTensorData<uint8_t>(input()), getTensorShape(input()).DimsData(), + input()->shape().num_dims(), getTensorData<uint8_t>(output()), + getTensorShape(output()).DimsData(), output()->shape().num_dims(), + axes_data, num_axes, _params.keep_dims, + getTensorData<int>(temp_index), getTensorData<int>(resolved_axes), + getTensorData<int>(temp_sum)); } else { tflite::reference_ops::QuantizedMeanOrSum<>( - getTensorData<uint8_t>(input()), input()->zero_point(), input()->scale(), - getTensorShape(input()).DimsData(), input()->shape().num_dims(), - getTensorData<uint8_t>(output()), output()->zero_point(), output()->scale(), - getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes, - _params.keep_dims, getTensorData<int>(_temp_index.get()), - getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get()), - /*compute_sum=*/false); + getTensorData<uint8_t>(input()), input()->zero_point(), input()->scale(), + getTensorShape(input()).DimsData(), input()->shape().num_dims(), + getTensorData<uint8_t>(output()), output()->zero_point(), output()->scale(), + getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes, + _params.keep_dims, getTensorData<int>(temp_index), getTensorData<int>(resolved_axes), + 
getTensorData<int>(temp_sum), + /*compute_sum=*/false); + } +} + +void Mean::evalQuantizedS16() const +{ + const auto *input_data = getTensorData<int16_t>(input()); + auto *output_data = getTensorData<int16_t>(output()); + + const Shape &input_shape = input()->shape(); + const Shape &output_shape = output()->shape(); + + const auto *axes_data = getTensorData<int32_t>(axes()); + const int num_axes = axes()->shape().num_elements(); + + constexpr int32_t output_min = -std::numeric_limits<int16_t>::max(); + constexpr int32_t output_max = std::numeric_limits<int16_t>::max(); + + // Defer to specialized implementation for 4D Mean across axes 1 & 2. + if (_params.keep_dims && input_shape.num_dims() == 4 && num_axes == 2 && + ((axes_data[0] == 1 && axes_data[1] == 2) || (axes_data[0] == 2 && axes_data[1] == 1))) + { + const int32_t batches = input_shape.dim(0); + const int32_t input_height = input_shape.dim(1); + const int32_t input_width = input_shape.dim(2); + const int32_t depth = input_shape.dim(3); + assert(output_shape.num_dims() == 4); + assert(output_shape.dim(0) == batches); + assert(output_shape.dim(1) == 1); + assert(output_shape.dim(2) == 1); + assert(output_shape.dim(3) == depth); + + const double real_multiplier = + static_cast<double>(input()->scale()) / static_cast<double>(output()->scale()); + + int32_t output_multiplier{}; + int output_shift{}; + quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift); + + const int32_t num_elements_in_axes = input_height * input_width; + + for (int32_t batch = 0; batch < batches; ++batch) + { + for (int32_t c = 0; c < depth; ++c) + { + int32_t acc = 0; + for (int32_t in_y = 0; in_y < input_height; ++in_y) + { + for (int32_t in_x = 0; in_x < input_width; ++in_x) + { + acc += input_data[calcOffset(input_shape, batch, in_y, in_x, c)]; + } + } + int32_t scaled_acc = + tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + // Divide by the number of elements rounding to the nearest 
integer. + scaled_acc = scaled_acc > 0 + ? (scaled_acc + num_elements_in_axes / 2) / num_elements_in_axes + : (scaled_acc - num_elements_in_axes / 2) / num_elements_in_axes; + + scaled_acc = std::max(scaled_acc, output_min); + scaled_acc = std::min(scaled_acc, output_max); + + output_data[calcOffset(output_shape, batch, 0, 0, c)] = scaled_acc; + } + } + } + else + { + throw std::runtime_error("Unsupported configuration."); } } |