diff options
author | Chunseok Lee <chunseok.lee@samsung.com> | 2022-04-15 19:15:11 +0900 |
---|---|---|
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2022-04-15 19:15:11 +0900 |
commit | 3ad689f0803519e343c36d5700646e86059df961 (patch) | |
tree | 862346c401a5577518fa7f042532aa931b53aa0e /compiler/luci-interpreter/pal | |
parent | ac6e4dd7b480e83b586ef533d7b29a8a97eb48fe (diff) | |
download | nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.gz nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.bz2 nnfw-3ad689f0803519e343c36d5700646e86059df961.zip |
Imported Upstream version 1.20.0upstream/1.20.0submit/tizen/20220415.103159
Diffstat (limited to 'compiler/luci-interpreter/pal')
32 files changed, 2099 insertions, 75 deletions
diff --git a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst index 771974afe..d134a6b95 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst +++ b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst @@ -7,9 +7,11 @@ REGISTER_KERNEL(Concatenation) REGISTER_KERNEL(Conv2D) REGISTER_KERNEL(DepthToSpace) REGISTER_KERNEL(DepthwiseConv2D) +REGISTER_KERNEL(Dequantize) REGISTER_KERNEL(Div) REGISTER_KERNEL(Elu) REGISTER_KERNEL(Exp) +REGISTER_KERNEL(ExpandDims) REGISTER_KERNEL(Floor) REGISTER_KERNEL(FloorDiv) REGISTER_KERNEL(Equal) @@ -37,6 +39,7 @@ REGISTER_KERNEL(NotEqual) REGISTER_KERNEL(Pad) REGISTER_KERNEL(PadV2) REGISTER_KERNEL(PRelu) +REGISTER_KERNEL(Quantize) REGISTER_KERNEL(Reshape) REGISTER_KERNEL(ResizeBilinear) REGISTER_KERNEL(ResizeNearestNeighbor) @@ -50,6 +53,7 @@ REGISTER_KERNEL(Square) REGISTER_KERNEL(SquaredDifference) REGISTER_KERNEL(Squeeze) REGISTER_KERNEL(Sub) +REGISTER_KERNEL(SVDF) REGISTER_KERNEL(Tanh) REGISTER_KERNEL(Transpose) REGISTER_KERNEL(TransposeConv) diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h b/compiler/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h new file mode 100644 index 000000000..a274afb7e --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H +#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H + +#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h> +#include <tensorflow/lite/kernels/internal/reference/pooling.h> +#include <arm_nn_types.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void AveragePool(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, T *output_data, + const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "AveragePool NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void AveragePool<int8_t>(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, + int8_t *scratchpad_data) +{ + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(scratchpad_data != nullptr); + + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + assert(batches == 1); + + const int depth = tflite::MatchingDim(input_shape, 3, output_shape, 3); + + cmsis_nn_dims input_dims; + input_dims.n = 1; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = depth; + + cmsis_nn_dims output_dims; + output_dims.n = 1; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = depth; + + cmsis_nn_pool_params pool_params; + pool_params.stride.h = params.stride_height; + pool_params.stride.w = params.stride_width; + pool_params.padding.h = params.padding_values.height; + pool_params.padding.w = params.padding_values.width; + pool_params.activation.min = params.quantized_activation_min; + pool_params.activation.max = params.quantized_activation_max; + + cmsis_nn_dims filter_dims; + filter_dims.n = 1; + filter_dims.h = params.filter_height; + filter_dims.w = params.filter_width; + filter_dims.c = 1; + + cmsis_nn_context ctx; + ctx.buf = scratchpad_data; + ctx.size = scratchpad_shape.Dims(0); + auto res = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data, &filter_dims, &output_dims, + output_data); + assert(res == ARM_MATH_SUCCESS); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape) + +{ + if (input_data_type == luci_interpreter::DataType::S8) + { + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int32_t output_width = output_shape.Dims(2); + const int32_t depth = tflite::MatchingDim(input_shape, 3, output_shape, 3); + + const int32_t buf_size = arm_avgpool_s8_get_buffer_size(output_width, depth); + auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type)); + + luci_interpreter::Shape scratchpad_shape{buf_size * data_type_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h b/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h index 0a8ae4e48..cfb84ea60 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h +++ b/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h @@ -19,6 +19,8 @@ #include <tensorflow/lite/kernels/internal/reference/conv.h> #include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h> +#include <arm_nn_types.h> +#include <arm_nnfunctions.h> namespace luci_interpreter_pal { @@ -26,11 +28,11 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const float *input_data, const tflite::RuntimeShape &filter_shape, const float *filter_data, const tflite::RuntimeShape &bias_shape, const float *bias_data, const tflite::RuntimeShape &output_shape, - float *output_data, const tflite::RuntimeShape &im2col_shape, - float *im2col_data) + float *output_data, const tflite::RuntimeShape &scratchpad_shape, + float *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, tflite::RuntimeShape(), nullptr); @@ -40,14 +42,14 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const uint8 *input_data, const tflite::RuntimeShape &filter_shape, const uint8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - uint8 *output_data, const tflite::RuntimeShape &im2col_shape, - uint8 *im2col_data) + uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + uint8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data, nullptr); + bias_shape, bias_data, output_shape, output_data, scratchpad_shape, + scratchpad_data, nullptr); } static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_t *mult, @@ -55,14 +57,141 @@ static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_ const int8 *input_data, const tflite::RuntimeShape &filter_shape, const int8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - int8 *output_data, const tflite::RuntimeShape &im2col_shape, - int8 *im2col_data) + int8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + int8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; - tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, - output_shape, output_data); + if (scratchpad_data) + { + cmsis_nn_conv_params conv_params; + conv_params.dilation.h = params.dilation_height_factor; + conv_params.dilation.w = params.dilation_width_factor; + + assert(conv_params.dilation.h == 1); + assert(conv_params.dilation.w == 1); + + conv_params.input_offset = params.input_offset; + conv_params.output_offset = params.output_offset; + conv_params.stride.h = params.stride_height; + conv_params.stride.w = params.stride_width; + conv_params.padding.h = params.padding_values.height; + conv_params.padding.w = params.padding_values.width; + conv_params.activation.min = params.quantized_activation_min; + conv_params.activation.max = params.quantized_activation_max; + + cmsis_nn_per_channel_quant_params quant_params; + quant_params.multiplier = const_cast<int32_t *>(mult); + quant_params.shift = const_cast<int32_t *>(shifts); + + assert(conv_params.activation.min <= conv_params.activation.max); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = tflite::MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_depth; + + cmsis_nn_dims filter_dims; + filter_dims.n = output_depth; + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = input_depth; + + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + cmsis_nn_context ctx; + ctx.buf = scratchpad_data; + ctx.size = scratchpad_shape.Dims(0); + + auto res = arm_convolve_wrapper_s8(&ctx, &conv_params, &quant_params, &input_dims, input_data, + &filter_dims, filter_data, &bias_dims, bias_data, + &output_dims, output_data); + assert(res == ARM_MATH_SUCCESS); + } + else + { + tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data); + } +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::ConvParams ¶ms, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + cmsis_nn_conv_params conv_params; + conv_params.dilation.h = params.dilation_height_factor; + conv_params.dilation.w = params.dilation_width_factor; + + if (input_data_type == loco::DataType::S8 && conv_params.dilation.h == 1 && + conv_params.dilation.w == 1) + { + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int32_t output_depth = tflite::MatchingDim(filter_shape, 0, output_shape, 3); + const int32_t filter_height = filter_shape.Dims(1); + const int32_t filter_width = filter_shape.Dims(2); + const int32_t output_height = output_shape.Dims(1); + const int32_t output_width = output_shape.Dims(2); + + conv_params.input_offset = params.input_offset; + conv_params.output_offset = params.output_offset; + conv_params.stride.h = params.stride_height; + conv_params.stride.w = params.stride_width; + conv_params.padding.h = params.padding_values.height; + conv_params.padding.w = params.padding_values.width; + + cmsis_nn_dims input_dims; + input_dims.n = batches; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_depth; + + cmsis_nn_dims filter_dims; + filter_dims.n = output_depth; + filter_dims.h = filter_height; + filter_dims.w = filter_width; + filter_dims.c = input_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batches; + output_dims.h = output_height; + output_dims.w = output_width; + output_dims.c = output_depth; + + const int32_t buf_size = arm_convolve_wrapper_s8_get_buffer_size(&conv_params, &input_dims, + &filter_dims, &output_dims); + + luci_interpreter::Shape scratchpad_shape{buf_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } } } // namespace luci_interpreter_pal diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h b/compiler/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h new file mode 100644 index 000000000..120dcd803 --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H +#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H + +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h> +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void +DepthwiseConvPerChannel(const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, + const T *input_data, const tflite::RuntimeShape &filter_shape, + const T *filter_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, const tflite::RuntimeShape &output_shape, + T *output_data, const tflite::RuntimeShape &scratchpad_shape, + T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "DepthwiseConvPerChannel NYI"); + (void)params; + (void)output_multiplier; + (void)output_shift; + (void)input_shape; + (void)output_data; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void DepthwiseConvPerChannel<int8_t>( + const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data) +{ + if (scratchpad_data) + { + cmsis_nn_dw_conv_params dw_conv_params; + dw_conv_params.dilation.h = params.dilation_height_factor; + dw_conv_params.dilation.w = params.dilation_width_factor; + assert(dw_conv_params.dilation.h == 1); + assert(dw_conv_params.dilation.w == 1); + + dw_conv_params.input_offset = params.input_offset; + dw_conv_params.output_offset = params.output_offset; + dw_conv_params.stride.h = params.stride_height; + dw_conv_params.stride.w = params.stride_width; + dw_conv_params.padding.h = params.padding_values.height; + dw_conv_params.padding.w = params.padding_values.width; + + dw_conv_params.activation.min = params.quantized_activation_min; + dw_conv_params.activation.max = params.quantized_activation_max; + dw_conv_params.ch_mult = params.depth_multiplier; + + cmsis_nn_per_channel_quant_params quant_params; + int32_t output_multiplier = params.output_multiplier; + int32_t output_shift = params.output_shift; + + quant_params.multiplier = &output_multiplier; + quant_params.shift = &output_shift; + + assert(dw_conv_params.activation.min <= dw_conv_params.activation.max); + const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_shape.Dims(3); + + cmsis_nn_dims filter_dims; + filter_dims.n = filter_shape.Dims(0); + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = output_depth; + + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + cmsis_nn_context ctx; + ctx.buf = scratchpad_data; + ctx.size = scratchpad_shape.Dims(0); + + auto res = arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params, &quant_params, &input_dims, + input_data, &filter_dims, filter_data, &bias_dims, + bias_data, &output_dims, output_data); + assert(res == ARM_MATH_SUCCESS); + } + else + { + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data); + } +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const tflite::DepthwiseParams ¶ms, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + cmsis_nn_dw_conv_params dw_conv_params; + dw_conv_params.dilation.h = params.dilation_height_factor; + dw_conv_params.dilation.w = params.dilation_width_factor; + + if (input_data_type == loco::DataType::S8 && dw_conv_params.dilation.h == 1 && + dw_conv_params.dilation.w == 1) + { + const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3); + + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_shape.Dims(3); + + cmsis_nn_dims filter_dims; + filter_dims.n = filter_shape.Dims(0); + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + const int32_t buf_size = arm_depthwise_conv_wrapper_s8_get_buffer_size( + &dw_conv_params, &input_dims, &filter_dims, &output_dims); + + auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type)); + + luci_interpreter::Shape scratchpad_shape{buf_size * data_type_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h new file mode 100644 index 000000000..15ff0327b --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H +#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ + +template <typename T> +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_integer_ops::Dequantize<T>(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const uint8_t *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::Dequantize(params, input_shape, input_data, output_shape, output_data); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALFullyConnected.h b/compiler/luci-interpreter/pal/cmsisnn/PALFullyConnected.h new file mode 100644 index 000000000..32e905761 --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALFullyConnected.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H +#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H + +#include <tensorflow/lite/kernels/internal/reference/fully_connected.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void FullyConnected(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &filter_shape, const T *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + { + // MARK: At this moment this operation doesn't support + assert(false && "FullyConnected NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + } +} + +template <> +inline void +FullyConnected<int8_t>(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data) +{ + assert(output_shape.DimensionsCount() == 2); + + const int batches = output_shape.Dims(0); + const int output_depth = output_shape.Dims(1); + + const int filter_dim_count = filter_shape.DimensionsCount(); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + + cmsis_nn_fc_params fc_params; + fc_params.input_offset = params.input_offset; + fc_params.output_offset = params.output_offset; + fc_params.filter_offset = params.weights_offset; + fc_params.activation.min = params.quantized_activation_min; + fc_params.activation.max = params.quantized_activation_max; + + cmsis_nn_per_tensor_quant_params quant_params; + quant_params.multiplier = params.output_multiplier; + quant_params.shift = params.output_shift; + + cmsis_nn_dims input_dims; + input_dims.n = batches; + input_dims.h = 1; + input_dims.w = 1; + input_dims.c = accum_depth; + + cmsis_nn_dims filter_dims; + filter_dims.n = accum_depth; + filter_dims.h = 1; + filter_dims.w = 1; + filter_dims.c = output_depth; + + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batches; + output_dims.h = 1; + output_dims.w = 1; + output_dims.c = output_depth; + + int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + auto buffer = std::make_unique<int8_t[]>(buf_size); + assert(buffer != nullptr); + + cmsis_nn_context ctx; + ctx.buf = buffer.get(); + ctx.size = buf_size; + + auto res = + arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims, + filter_data, &bias_dims, bias_data, &output_dims, output_data); + assert(res == ARM_MATH_SUCCESS); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALMul.h b/compiler/luci-interpreter/pal/cmsisnn/PALMul.h index 2b46b100c..347a97a83 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/PALMul.h +++ b/compiler/luci-interpreter/pal/cmsisnn/PALMul.h @@ -21,21 +21,21 @@ namespace luci_interpreter_pal { +template <typename T> static inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, - const float *input1_data, const tflite::RuntimeShape &input2_shape, - const float *input2_data, const tflite::RuntimeShape &output_shape, - float *output_data) + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, + T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); } -static inline void BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, - const tflite::RuntimeShape &input1_shape, - const float *input1_data, - const tflite::RuntimeShape &input2_shape, - const float *input2_data, - const tflite::RuntimeShape &output_shape, float *output_data) +template <typename T> +static inline void +BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h new file mode 100644 index 000000000..6046789ae --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H +#define LUCI_INTERPRETER_PAL_QUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Quantize(tflite::QuantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const float *input_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::reference_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data); +} + +template <typename Input, typename Output> +static inline void Requantize(const Input *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zero_point, int32_t output_zero_point, + Output *output_data) +{ + tflite::reference_ops::Requantize(input_data, size, effective_scale_multiplier, + effective_scale_shift, input_zero_point, output_zero_point, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALSVDF.h b/compiler/luci-interpreter/pal/cmsisnn/PALSVDF.h new file mode 100644 index 000000000..a4a5b2a78 --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALSVDF.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_SVDF_H +#define LUCI_INTERPRETER_PAL_SVDF_H + +#include <arm_nn_types.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +static inline void +IntegerSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape, + const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, int16_t *activation_state_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data, + int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a, + int scale_2_b, int32_t input_zp, int32_t output_zp) +{ + const int32_t rank = params.rank; + const int32_t batch_size = input_shape.Dims(0); + const int32_t num_filters = weight_feature_shape.Dims(0); + const int32_t memory_size = weight_time_shape.Dims(1); + + cmsis_nn_dims input_dims; + input_dims.n = input_shape.Dims(0); + input_dims.h = input_shape.Dims(1); + + cmsis_nn_dims weights_feature_dims; + weights_feature_dims.n = weight_feature_shape.Dims(0); + weights_feature_dims.h = weight_feature_shape.Dims(1); + + cmsis_nn_dims weights_time_dims; + weights_time_dims.n = weight_time_shape.Dims(0); + weights_time_dims.h = weight_time_shape.Dims(1); + + cmsis_nn_dims bias_dims; + bias_dims.n = bias_shape.Dims(0); + + cmsis_nn_dims state_dims; + state_dims.n = batch_size; + state_dims.h = memory_size * num_filters; + + cmsis_nn_dims output_dims; + output_dims.n = output_shape.Dims(0); + output_dims.h = output_shape.Dims(1); + + cmsis_nn_svdf_params svdf_params; + svdf_params.rank = params.rank; + svdf_params.input_offset = input_zp; + svdf_params.output_offset = output_zp; + + svdf_params.input_activation.min = INT16_MIN; + svdf_params.input_activation.max = INT16_MAX; + + svdf_params.output_activation.min = INT8_MIN; + svdf_params.output_activation.max = INT8_MAX; + + cmsis_nn_per_tensor_quant_params in_quant_params; + in_quant_params.multiplier = scale_1_a; + in_quant_params.shift = scale_1_b; + + cmsis_nn_per_tensor_quant_params out_quant_params; + out_quant_params.multiplier = scale_2_a; + out_quant_params.shift = scale_2_b; + + cmsis_nn_context scratch_ctx; + scratch_ctx.buf = scratchpad_data; + + cmsis_nn_context scratch_output_ctx; + scratch_output_ctx.buf = output_temp_data; + + arm_svdf_s8(&scratch_ctx, &scratch_output_ctx, &svdf_params, &in_quant_params, &out_quant_params, + &input_dims, input_data, &state_dims, activation_state_data, &weights_feature_dims, + weight_feature_data, &weights_time_dims, weight_time_data, &bias_dims, bias_data, + &output_dims, output_data); +} +static inline void +FloatSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const float *input_data, const tflite::RuntimeShape &weight_feature_shape, + const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const float *weight_time_data, const tflite::RuntimeShape &bias_shape, + const float *bias_data, float *scratchpad_data, float *activation_state_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + const int32_t rank = params.rank; + const int32_t batch_size = input_shape.Dims(0); + const int32_t input_size = input_shape.Dims(1); + const int32_t num_filters = weight_feature_shape.Dims(0); + const int32_t num_units = num_filters / rank; + const int32_t memory_size = weight_time_shape.Dims(1); + + // Left shift the activation_state. + { + float *new_state_start = activation_state_data; + const float *old_state_start = activation_state_data + 1; + const float *old_state_end = activation_state_data + batch_size * num_filters * memory_size; + while (old_state_start != old_state_end) + { + *new_state_start++ = *old_state_start++; + } + } + + // Note: no need to clear the latest activation, matmul is not accumulative. + + // Compute conv1d(inputs, weights_feature). + // The activation_state's rightmost column is used to save current cycle + // activation. This is achieved by starting at state_ptr[memory_size - 1] and + // having the stride equal to memory_size. + + // Perform batched matrix vector multiply operation: + { + const float *matrix = weight_feature_data; + const float *vector = input_data; + float *result = &activation_state_data[memory_size - 1]; + float *result_in_batch = result; + for (int i = 0; i < batch_size; ++i) + { + const float *matrix_ptr = matrix; + for (int j = 0; j < num_filters; ++j) + { + float dot_prod = 0.0f; + const float *vector_in_batch = vector + i * input_size; + for (int k = 0; k < input_size; ++k) + { + dot_prod += *matrix_ptr++ * *vector_in_batch++; + } + *result_in_batch = dot_prod; + result_in_batch += memory_size; + } + } + } + + tflite::reference_ops::ApplyTimeWeightsBiasAndActivation( + batch_size, memory_size, num_filters, num_units, rank, weight_time_data, bias_data, + params.activation, activation_state_data, scratchpad_data, output_data); +} + +static inline void SetupScratchpadTensor( + const luci_interpreter::DataType &input_data_type, + const luci_interpreter::DataType &weight_feature_data_type, + luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2, + luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4, + luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6, + const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape, + const int32_t batch_size, const int32_t num_filters, const int32_t num_units) +{ + if (input_data_type == loco::DataType::FLOAT32 && + (weight_feature_data_type == loco::DataType::S8 || + weight_feature_data_type == loco::DataType::U8)) + { + (void)input_shape; + (void)weight_time_shape; + (void)scratchpad_3; + (void)scratchpad_4; + (void)scratchpad_5; + (void)scratchpad_6; + + throw std::runtime_error("Hybrid type is not supported for cmsisnn"); + } + + // Resize scratchpad_1 tensor + scratchpad_1->resize({batch_size, num_filters}); + + if (input_data_type == loco::DataType::S8) + { + // Resize scratchpad_2 for full_integer op + scratchpad_2->resize({batch_size, num_units}); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_SVDF_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/pal.cmake b/compiler/luci-interpreter/pal/cmsisnn/pal.cmake index 9a25a3c5d..a68b363d9 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/pal.cmake +++ b/compiler/luci-interpreter/pal/cmsisnn/pal.cmake @@ -42,9 +42,12 @@ macro(add_pal_to_target TGT) "${TensorFlowSource_DIR}") target_include_directories(${TGT} PRIVATE ${LUCI_INTERPRETER_PAL_DIR}) - set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + file(GLOB_RECURSE PAL_SOURCES "${CMSISSource_DIR}/CMSIS/NN/Source/*.c") + list(APPEND PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc) add_library(luci_interpreter_cmsisnn_pal STATIC ${PAL_SOURCES}) - set_target_properties(luci_interpreter_cmsisnn_pal PROPERTIES POSITION_INDEPENDENT_CODE ON) + set_property(TARGET luci_interpreter_cmsisnn_pal PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(luci_interpreter_cmsisnn_pal PRIVATE "${TensorFlowRuySource_DIR}" "${TensorFlowGEMMLowpSource_DIR}" @@ -53,7 +56,7 @@ macro(add_pal_to_target TGT) ) add_subdirectory(${CMSISSource_DIR}/CMSIS/NN ${CMAKE_CURRENT_BINARY_DIR}/CMSISNN) - target_include_directories(luci_interpreter_cmsisnn_pal PRIVATE + target_include_directories(luci_interpreter_cmsisnn_pal PUBLIC "${CMSISSource_DIR}/CMSIS/NN/Include" "${CMSISSource_DIR}/CMSIS/DSP/Include" "${CMSISSource_DIR}/CMSIS/Core/Include") diff --git a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst index 9d541276c..428b15ee0 100644 --- a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst +++ b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst @@ -1,19 +1,23 @@ REGISTER_KERNEL(Add) REGISTER_KERNEL(ArgMax) REGISTER_KERNEL(AveragePool2D) +REGISTER_KERNEL(BatchMatMul) REGISTER_KERNEL(BatchToSpaceND) REGISTER_KERNEL(Cast) REGISTER_KERNEL(Concatenation) REGISTER_KERNEL(Conv2D) REGISTER_KERNEL(DepthToSpace) REGISTER_KERNEL(DepthwiseConv2D) +REGISTER_KERNEL(Dequantize) REGISTER_KERNEL(Div) REGISTER_KERNEL(Elu) REGISTER_KERNEL(Exp) +REGISTER_KERNEL(ExpandDims) REGISTER_KERNEL(Floor) REGISTER_KERNEL(FloorDiv) REGISTER_KERNEL(Equal) REGISTER_KERNEL(FullyConnected) +REGISTER_KERNEL(Gather) REGISTER_KERNEL(Greater) REGISTER_KERNEL(GreaterEqual) REGISTER_KERNEL(If) @@ -37,11 +41,13 @@ REGISTER_KERNEL(MirrorPad) REGISTER_KERNEL(Mul) REGISTER_KERNEL(Neg) REGISTER_KERNEL(NotEqual) +REGISTER_KERNEL(OneHot) REGISTER_KERNEL(Pack) REGISTER_KERNEL(Pad) REGISTER_KERNEL(PadV2) REGISTER_KERNEL(Pow) REGISTER_KERNEL(PRelu) +REGISTER_KERNEL(Quantize) REGISTER_KERNEL(Relu) REGISTER_KERNEL(Relu6) REGISTER_KERNEL(Reshape) @@ -61,6 +67,7 @@ REGISTER_KERNEL(Square) REGISTER_KERNEL(SquaredDifference) REGISTER_KERNEL(Squeeze) REGISTER_KERNEL(Sub) +REGISTER_KERNEL(SVDF) REGISTER_KERNEL(Tanh) REGISTER_KERNEL(Transpose) REGISTER_KERNEL(TransposeConv) diff --git a/compiler/luci-interpreter/pal/linux/PALAveragePool2d.h b/compiler/luci-interpreter/pal/linux/PALAveragePool2d.h new file mode 100644 index 000000000..cce30601f --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALAveragePool2d.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H +#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H + +#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h> +#include <tensorflow/lite/kernels/internal/reference/pooling.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void AveragePool(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, T *output_data, + const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data) +{ + { + // MARK: At this moment this operation doesn't support + assert(false && "AveragePool NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void AveragePool<int8_t>(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, + int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + + tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)input_data_type; + (void)input_shape; + (void)output_shape; + + scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H diff --git a/compiler/luci-interpreter/pal/linux/PALBatchMatMul.h b/compiler/luci-interpreter/pal/linux/PALBatchMatMul.h new file mode 100644 index 000000000..3894f2d92 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALBatchMatMul.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_BATCHMATMUL_H +#define LUCI_INTERPRETER_PAL_BATCHMATMUL_H + +#include <tensorflow/lite/kernels/internal/reference/batch_matmul.h> + +namespace luci_interpreter_pal +{ +inline void BatchMatMul(const tflite::RuntimeShape &lhs_shape, const float *lhs_data, + const tflite::RuntimeShape &rhs_shape, const float *rhs_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::BatchMatMul(lhs_shape, lhs_data, rhs_shape, rhs_data, output_shape, + output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *lhs_scratchpad, + luci_interpreter::Tensor *rhs_scratchpad, + const tflite::RuntimeShape &lhs_shape, + const tflite::RuntimeShape &rhs_shape) +{ + // Scratchpad for transposed LHS + { + auto lhs_rank = lhs_shape.DimensionsCount(); + luci_interpreter::Shape scratchpad_size(lhs_rank); + for (int i = 0; i < lhs_rank - 2; ++i) + { + scratchpad_size.dim(i) = lhs_shape.Dims(i); + } + scratchpad_size.dim(lhs_rank - 2) = lhs_shape.Dims(lhs_rank - 1); + scratchpad_size.dim(lhs_rank - 1) = lhs_shape.Dims(lhs_rank - 2); + + lhs_scratchpad->resize(scratchpad_size); + } + // Scratchpad for transposed RHS + { + auto rhs_rank = rhs_shape.DimensionsCount(); + luci_interpreter::Shape scratchpad_size(rhs_rank); + for (int i = 0; i < rhs_rank - 2; ++i) + { + scratchpad_size.dim(i) = rhs_shape.Dims(i); + } + scratchpad_size.dim(rhs_rank - 2) = rhs_shape.Dims(rhs_rank - 1); + scratchpad_size.dim(rhs_rank - 1) = rhs_shape.Dims(rhs_rank - 2); + + rhs_scratchpad->resize(scratchpad_size); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_BATCHMATMUL_H diff --git a/compiler/luci-interpreter/pal/linux/PALConv2d.h b/compiler/luci-interpreter/pal/linux/PALConv2d.h index 2550dd5d7..985a15f39 100644 --- a/compiler/luci-interpreter/pal/linux/PALConv2d.h +++ b/compiler/luci-interpreter/pal/linux/PALConv2d.h @@ -26,14 +26,24 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const float *input_data, const tflite::RuntimeShape &filter_shape, const float *filter_data, const tflite::RuntimeShape &bias_shape, const float *bias_data, const tflite::RuntimeShape &output_shape, - float *output_data, const tflite::RuntimeShape &im2col_shape, - float *im2col_data) + float *output_data, const tflite::RuntimeShape &scratchpad_shape, + float *scratchpad_data) { - if (im2col_data) + (void)scratchpad_shape; + if (scratchpad_data) { + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int32_t output_height = output_shape.Dims(1); + const int32_t output_width = output_shape.Dims(2); + const int32_t filter_height = filter_shape.Dims(1); + const int32_t filter_width = filter_shape.Dims(2); + tflite::RuntimeShape im2col_shape{batches, output_height, output_width, + input_depth * filter_height * filter_width}; + tflite::optimized_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data); + scratchpad_data); } else tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, @@ -45,8 +55,8 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const uint8 *input_data, const tflite::RuntimeShape &filter_shape, const uint8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - uint8 *output_data, const tflite::RuntimeShape &im2col_shape, - uint8 *im2col_data) + uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + uint8 *scratchpad_data) { // TODO This should only be done once (although it takes only a few microseconds). // Also, the user should be able to adjust the number of threads. @@ -54,8 +64,8 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency())); tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data, gemmlowp_context.get()); + bias_shape, bias_data, output_shape, output_data, scratchpad_shape, + scratchpad_data, gemmlowp_context.get()); } static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_t *mult, @@ -63,17 +73,55 @@ static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_ const int8 *input_data, const tflite::RuntimeShape &filter_shape, const int8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - int8 *output_data, const tflite::RuntimeShape &im2col_shape, - int8 *im2col_data) + int8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + int8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; // TODO enable optimized version tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data); } +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::ConvParams ¶ms, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + const int32_t filter_height = filter_shape.Dims(1); + const int32_t filter_width = filter_shape.Dims(2); + + // Allocate tensor for scratchpad, if needed. + // The checks here should be aligned with the actual implementation. + const bool need_dilated_scratchpad = + params.dilation_height_factor != 1 || params.dilation_width_factor != 1; + const bool need_non_dilated_scratchpad = params.stride_height != 1 || params.stride_width != 1 || + filter_height != 1 || filter_width != 1; + auto _need_scratchpad = input_data_type != luci_interpreter::DataType::S16 && + (need_dilated_scratchpad || need_non_dilated_scratchpad); + + if (_need_scratchpad) + { + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int32_t output_height = output_shape.Dims(1); + const int32_t output_width = output_shape.Dims(2); + + auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type)); + int32_t scratchpad_size = batches * output_width * output_height * input_depth * filter_height * + filter_width * data_type_size; + luci_interpreter::Shape scratchpad_shape{scratchpad_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } +} + } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_CONV2D_H diff --git a/compiler/luci-interpreter/pal/linux/PALDepthwiseConv2d.h b/compiler/luci-interpreter/pal/linux/PALDepthwiseConv2d.h new file mode 100644 index 000000000..c9d1a2948 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALDepthwiseConv2d.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H +#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H + +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h> +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void +DepthwiseConvPerChannel(const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, + const T *input_data, const tflite::RuntimeShape &filter_shape, + const T *filter_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, const tflite::RuntimeShape &output_shape, + T *output_data, const tflite::RuntimeShape &scratchpad_shape, + T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "DepthwiseConvPerChannel NYI"); + (void)params; + (void)output_multiplier; + (void)output_shift; + (void)input_shape; + (void)output_data; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void DepthwiseConvPerChannel<int8_t>( + const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const tflite::DepthwiseParams ¶ms, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)params; + (void)input_data_type; + (void)input_shape; + (void)filter_shape; + (void)output_shape; + + scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H diff --git a/compiler/luci-interpreter/pal/linux/PALDequantize.h b/compiler/luci-interpreter/pal/linux/PALDequantize.h new file mode 100644 index 000000000..3af6d0777 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALDequantize.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H +#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H + +#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::optimized_ops::Dequantize(params, input_shape, input_data, output_shape, output_data); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H diff --git a/compiler/luci-interpreter/pal/linux/PALFullyConnected.h b/compiler/luci-interpreter/pal/linux/PALFullyConnected.h new file mode 100644 index 000000000..62970dbf7 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALFullyConnected.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H +#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H + +#include <tensorflow/lite/kernels/internal/reference/fully_connected.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void FullyConnected(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &filter_shape, const T *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + { + // MARK: At this moment this operation doesn't support + assert(false && "FullyConnected NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + } +} + +template <> +inline void +FullyConnected<int8_t>(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data) +{ + tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H diff --git a/compiler/luci-interpreter/pal/linux/PALGather.h b/compiler/luci-interpreter/pal/linux/PALGather.h new file mode 100644 index 000000000..49ac35f93 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALGather.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_GATHER_H +#define LUCI_INTERPRETER_PAL_GATHER_H + +#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h> + +namespace luci_interpreter_pal +{ +template <typename T, typename CoordsT = int32> +static inline void Gather(const tflite::GatherParams &op_params, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &coords_shape, const CoordsT *coords_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::optimized_ops::Gather(op_params, input_shape, input_data, coords_shape, coords_data, + output_shape, output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_GATHER_H diff --git a/compiler/luci-interpreter/pal/linux/PALMul.h b/compiler/luci-interpreter/pal/linux/PALMul.h index cfaec1b58..a8a9d4abc 100644 --- a/compiler/luci-interpreter/pal/linux/PALMul.h +++ b/compiler/luci-interpreter/pal/linux/PALMul.h @@ -21,21 +21,31 @@ namespace luci_interpreter_pal { +template <typename T> static inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, - const float *input1_data, const tflite::RuntimeShape &input2_shape, - const float *input2_data, const tflite::RuntimeShape &output_shape, - float *output_data) + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, + T *output_data) { tflite::optimized_ops::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); } -static inline void BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, - const tflite::RuntimeShape &input1_shape, - const float *input1_data, - const tflite::RuntimeShape &input2_shape, - const float *input2_data, - const tflite::RuntimeShape &output_shape, float *output_data) +template <> +inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const int64_t *input1_data, const tflite::RuntimeShape &input2_shape, + const int64_t *input2_data, const tflite::RuntimeShape &output_shape, + int64_t *output_data) +{ + tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); +} + +template <typename T> +static inline void +BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data) { tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); diff --git a/compiler/luci-interpreter/pal/linux/PALQuantize.h b/compiler/luci-interpreter/pal/linux/PALQuantize.h new file mode 100644 index 000000000..bf1d7954e --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALQuantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H +#define LUCI_INTERPRETER_PAL_QUANTIZE_H + +#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Quantize(tflite::QuantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const float *input_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::optimized_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data); +} + +template <typename Input, typename Output> +static inline void Requantize(const Input *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zero_point, int32_t output_zero_point, + Output *output_data) +{ + tflite::optimized_ops::Requantize(input_data, size, effective_scale_multiplier, + effective_scale_shift, input_zero_point, output_zero_point, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H diff --git a/compiler/luci-interpreter/pal/linux/PALSVDF.h b/compiler/luci-interpreter/pal/linux/PALSVDF.h new file mode 100644 index 000000000..0ffba14f0 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALSVDF.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_SVDF_H +#define LUCI_INTERPRETER_PAL_SVDF_H + +#include <tensorflow/lite/kernels/internal/reference/svdf.h> + +namespace luci_interpreter_pal +{ +static inline void +IntegerSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape, + const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, int16_t *activation_state_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data, + int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a, + int scale_2_b, int32_t input_zp, int32_t output_zp) +{ + tflite::reference_ops::EvalIntegerSVDF(¶ms, input_shape, input_data, weight_feature_shape, + weight_feature_data, weight_time_shape, weight_time_data, + bias_shape, bias_data, activation_state_data, output_shape, + output_data, scratchpad_data, output_temp_data, scale_1_a, + scale_1_b, scale_2_a, scale_2_b, input_zp, output_zp); +} +static inline void +FloatSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const float *input_data, const tflite::RuntimeShape &weight_feature_shape, + const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const float *weight_time_data, const tflite::RuntimeShape &bias_shape, + const float *bias_data, float *scratchpad_data, float *activation_state_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::EvalFloatSVDF(¶ms, input_shape, input_data, weight_feature_shape, + weight_feature_data, weight_time_shape, weight_time_data, + bias_shape, bias_data, scratchpad_data, + activation_state_data, output_shape, output_data); +} + +static inline void SetupScratchpadTensor( + const luci_interpreter::DataType &input_data_type, + const luci_interpreter::DataType &weight_feature_data_type, + luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2, + luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4, + luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6, + const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape, + const int32_t batch_size, const int32_t num_filters, const int32_t num_units) +{ + + if (input_data_type == loco::DataType::FLOAT32 && + (weight_feature_data_type == loco::DataType::S8 || + weight_feature_data_type == loco::DataType::U8)) + { + (void)input_shape; + (void)weight_time_shape; + (void)scratchpad_3; + (void)scratchpad_4; + (void)scratchpad_5; + (void)scratchpad_6; + + throw std::runtime_error("Hybrid type is not currently supported for linux platform"); + } + + // Resize scratchpad_1 tensor + scratchpad_1->resize({batch_size, num_filters}); + + if (input_data_type == loco::DataType::S8) + { + // Resize scratchpad_2 for full_integer op + scratchpad_2->resize({batch_size, num_units}); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_SVDF_H diff --git a/compiler/luci-interpreter/pal/linux/pal.cmake b/compiler/luci-interpreter/pal/linux/pal.cmake index 84349e0bf..185700cf9 100644 --- a/compiler/luci-interpreter/pal/linux/pal.cmake +++ b/compiler/luci-interpreter/pal/linux/pal.cmake @@ -40,7 +40,35 @@ macro(add_pal_to_target TGT) # TODO put it back, I changed my mind. # instead add sources with visitors in this library - set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + + if(BUILD_ARM32_NEON) + # NOTE may need to revise this list for version upgrade + set(PAL_SOURCES ${PAL_SOURCES} + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/optimized/cpu_check.cc + ${TensorFlowRuySource_DIR}/ruy/allocator.cc + ${TensorFlowRuySource_DIR}/ruy/block_map.cc + ${TensorFlowRuySource_DIR}/ruy/blocking_counter.cc + ${TensorFlowRuySource_DIR}/ruy/context_get_ctx.cc + ${TensorFlowRuySource_DIR}/ruy/cpuinfo.cc + ${TensorFlowRuySource_DIR}/ruy/ctx.cc + ${TensorFlowRuySource_DIR}/ruy/denormal.cc + ${TensorFlowRuySource_DIR}/ruy/frontend.cc + ${TensorFlowRuySource_DIR}/ruy/pack_arm.cc + ${TensorFlowRuySource_DIR}/ruy/prepacked_cache.cc + ${TensorFlowRuySource_DIR}/ruy/prepare_packed_matrices.cc + ${TensorFlowRuySource_DIR}/ruy/system_aligned_alloc.cc + ${TensorFlowRuySource_DIR}/ruy/thread_pool.cc + ${TensorFlowRuySource_DIR}/ruy/trmul.cc + ${TensorFlowRuySource_DIR}/ruy/tune.cc + ${TensorFlowRuySource_DIR}/ruy/wait.cc + ${TensorFlowRuySource_DIR}/ruy/kernel_arm32.cc + ) + endif(BUILD_ARM32_NEON) + add_library(luci_interpreter_linux_pal STATIC ${PAL_SOURCES}) set_target_properties(luci_interpreter_linux_pal PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(luci_interpreter_linux_pal SYSTEM PRIVATE diff --git a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst index 771974afe..d134a6b95 100644 --- a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst +++ b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst @@ -7,9 +7,11 @@ REGISTER_KERNEL(Concatenation) REGISTER_KERNEL(Conv2D) REGISTER_KERNEL(DepthToSpace) REGISTER_KERNEL(DepthwiseConv2D) +REGISTER_KERNEL(Dequantize) REGISTER_KERNEL(Div) REGISTER_KERNEL(Elu) REGISTER_KERNEL(Exp) +REGISTER_KERNEL(ExpandDims) REGISTER_KERNEL(Floor) REGISTER_KERNEL(FloorDiv) REGISTER_KERNEL(Equal) @@ -37,6 +39,7 @@ REGISTER_KERNEL(NotEqual) REGISTER_KERNEL(Pad) REGISTER_KERNEL(PadV2) REGISTER_KERNEL(PRelu) +REGISTER_KERNEL(Quantize) REGISTER_KERNEL(Reshape) REGISTER_KERNEL(ResizeBilinear) REGISTER_KERNEL(ResizeNearestNeighbor) @@ -50,6 +53,7 @@ REGISTER_KERNEL(Square) REGISTER_KERNEL(SquaredDifference) REGISTER_KERNEL(Squeeze) REGISTER_KERNEL(Sub) +REGISTER_KERNEL(SVDF) REGISTER_KERNEL(Tanh) REGISTER_KERNEL(Transpose) REGISTER_KERNEL(TransposeConv) diff --git a/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h b/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h new file mode 100644 index 000000000..cce30601f --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H +#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H + +#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h> +#include <tensorflow/lite/kernels/internal/reference/pooling.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void AveragePool(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, T *output_data, + const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data) +{ + { + // MARK: At this moment this operation doesn't support + assert(false && "AveragePool NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void AveragePool<int8_t>(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, + int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + + tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)input_data_type; + (void)input_shape; + (void)output_shape; + + scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H diff --git a/compiler/luci-interpreter/pal/mcu/PALConv2d.h b/compiler/luci-interpreter/pal/mcu/PALConv2d.h index 0a8ae4e48..13976877a 100644 --- a/compiler/luci-interpreter/pal/mcu/PALConv2d.h +++ b/compiler/luci-interpreter/pal/mcu/PALConv2d.h @@ -26,11 +26,11 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const float *input_data, const tflite::RuntimeShape &filter_shape, const float *filter_data, const tflite::RuntimeShape &bias_shape, const float *bias_data, const tflite::RuntimeShape &output_shape, - float *output_data, const tflite::RuntimeShape &im2col_shape, - float *im2col_data) + float *output_data, const tflite::RuntimeShape &scratchpad_shape, + float *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, tflite::RuntimeShape(), nullptr); @@ -40,14 +40,14 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const uint8 *input_data, const tflite::RuntimeShape &filter_shape, const uint8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - uint8 *output_data, const tflite::RuntimeShape &im2col_shape, - uint8 *im2col_data) + uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + uint8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data, nullptr); + bias_shape, bias_data, output_shape, output_data, scratchpad_shape, + scratchpad_data, nullptr); } static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_t *mult, @@ -55,16 +55,31 @@ static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_ const int8 *input_data, const tflite::RuntimeShape &filter_shape, const int8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - int8 *output_data, const tflite::RuntimeShape &im2col_shape, - int8 *im2col_data) + int8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + int8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data); } +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::ConvParams ¶ms, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + (void)input_data_type; + (void)params; + (void)input_shape; + (void)filter_shape; + (void)output_shape; + scratchpad->set_allocatable(false); +} + } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_CONV2D_H diff --git a/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h b/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h new file mode 100644 index 000000000..c9d1a2948 --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H +#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H + +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h> +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void +DepthwiseConvPerChannel(const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, + const T *input_data, const tflite::RuntimeShape &filter_shape, + const T *filter_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, const tflite::RuntimeShape &output_shape, + T *output_data, const tflite::RuntimeShape &scratchpad_shape, + T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "DepthwiseConvPerChannel NYI"); + (void)params; + (void)output_multiplier; + (void)output_shift; + (void)input_shape; + (void)output_data; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void DepthwiseConvPerChannel<int8_t>( + const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const tflite::DepthwiseParams ¶ms, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)params; + (void)input_data_type; + (void)input_shape; + (void)filter_shape; + (void)output_shape; + + scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H diff --git a/compiler/luci-interpreter/pal/mcu/PALDequantize.h b/compiler/luci-interpreter/pal/mcu/PALDequantize.h new file mode 100644 index 000000000..15ff0327b --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALDequantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H +#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ + +template <typename T> +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_integer_ops::Dequantize<T>(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const uint8_t *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::Dequantize(params, input_shape, input_data, output_shape, output_data); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H diff --git a/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h b/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h new file mode 100644 index 000000000..048624d74 --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H +#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H + +#include <tensorflow/lite/kernels/internal/reference/fully_connected.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void FullyConnected(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &filter_shape, const T *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "FullyConnected NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + } +} + +template <> +inline void +FullyConnected<int8_t>(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data) +{ + tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H diff --git a/compiler/luci-interpreter/pal/mcu/PALMul.h b/compiler/luci-interpreter/pal/mcu/PALMul.h index 2b46b100c..347a97a83 100644 --- a/compiler/luci-interpreter/pal/mcu/PALMul.h +++ b/compiler/luci-interpreter/pal/mcu/PALMul.h @@ -21,21 +21,21 @@ namespace luci_interpreter_pal { +template <typename T> static inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, - const float *input1_data, const tflite::RuntimeShape &input2_shape, - const float *input2_data, const tflite::RuntimeShape &output_shape, - float *output_data) + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, + T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); } -static inline void BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, - const tflite::RuntimeShape &input1_shape, - const float *input1_data, - const tflite::RuntimeShape &input2_shape, - const float *input2_data, - const tflite::RuntimeShape &output_shape, float *output_data) +template <typename T> +static inline void +BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); diff --git a/compiler/luci-interpreter/pal/mcu/PALQuantize.h b/compiler/luci-interpreter/pal/mcu/PALQuantize.h new file mode 100644 index 000000000..6046789ae --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALQuantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H +#define LUCI_INTERPRETER_PAL_QUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Quantize(tflite::QuantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const float *input_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::reference_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data); +} + +template <typename Input, typename Output> +static inline void Requantize(const Input *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zero_point, int32_t output_zero_point, + Output *output_data) +{ + tflite::reference_ops::Requantize(input_data, size, effective_scale_multiplier, + effective_scale_shift, input_zero_point, output_zero_point, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H diff --git a/compiler/luci-interpreter/pal/mcu/PALSVDF.h b/compiler/luci-interpreter/pal/mcu/PALSVDF.h new file mode 100644 index 000000000..3bba668fb --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALSVDF.h @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_SVDF_H +#define LUCI_INTERPRETER_PAL_SVDF_H + +#include <tensorflow/lite/kernels/internal/reference/svdf.h> + +namespace luci_interpreter_pal +{ +static inline void +IntegerSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape, + const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, int16_t *activation_state_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data, + int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a, + int scale_2_b, int32_t input_zp, int32_t output_zp) +{ + const int n_rank = params.rank; + const int n_batch = input_shape.Dims(0); + const int n_input = input_shape.Dims(1); + const int n_filter = weight_feature_shape.Dims(0); + const int n_unit = n_filter / n_rank; + const int n_memory = weight_time_shape.Dims(1); + + // Left shift the activation_state. + { + int16_t *new_state_start = activation_state_data; + const int16_t *old_state_start = activation_state_data + 1; + const int16_t *old_state_end = activation_state_data + n_batch * n_filter * n_memory; + while (old_state_start != old_state_end) + { + *new_state_start++ = *old_state_start++; + } + } + + // Note: no need to clear the latest activation, matmul is not accumulative. + + // Feature matmul. + { + const int32_t output_max = std::numeric_limits<int16_t>::max(); + const int32_t output_min = std::numeric_limits<int16_t>::min(); + int16_t *result_in_batch = activation_state_data + (n_memory - 1); + for (int b = 0; b < n_batch; b++) + { + const int8_t *matrix_ptr = weight_feature_data; + for (int r = 0; r < n_filter; r++) + { + int32_t dot_prod = 0; + const int8_t *vector_in_batch = input_data + b * n_input; + for (int c = 0; c < n_input; c++) + { + dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp); + } + dot_prod = tflite::MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b); + dot_prod = std::min(std::max(output_min, dot_prod), output_max); + // This assumes state is symmetrically quantized. Otherwise last bit of + // state should be initialized to its zero point and accumulate the + // dot_prod. + // Equivalent as the following: + // result_in_batch = zero point, which happens to be zero. + // result_in_batch += dot_prod_56. + *result_in_batch = dot_prod; + result_in_batch += n_memory; + } + } + } + + // Time. + { + for (int b = 0; b < n_batch; ++b) + { + int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter; + + // Perform batched vector dot product: + const int16_t *vector1_ptr = weight_time_data; + const int16_t *vector2_ptr = activation_state_data + b * n_memory * n_filter; + + for (int i = 0; i < n_filter; i++) + { + *scratch_ptr_batch = 0; + for (int j = 0; j < n_memory; j++) + { + *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++; + } + scratch_ptr_batch++; + } + } + } + + // Reduce, add bias, rescale, activation. + { + // Add bias. + if (bias_data) + { + // Vector batch assign: + for (int i = 0; i < n_batch; ++i) + { + int32_t *output_ptr = output_temp_data + i * n_unit; + const int32_t *bias_ptr = bias_data; + for (int j = 0; j < n_unit; ++j) + { + *output_ptr++ = *bias_ptr++; + } + } + } + else + { + int32_t *output_ptr = output_temp_data; + for (int i = 0; i < n_batch * n_unit; ++i) + { + *output_ptr++ = 0; + } + } + + // Reduce. + for (int b = 0; b < n_batch; ++b) + { + int32_t *output_temp_ptr = output_temp_data + b * n_unit; + int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter; + + // Reduction sum vector + for (int i = 0; i < n_unit; ++i) + { + for (int j = 0; j < n_rank; ++j) + { + output_temp_ptr[i] += *scratch_ptr_batch++; + } + } + } + + // Rescale. + const int32_t output_max = std::numeric_limits<int8_t>::max(); + const int32_t output_min = std::numeric_limits<int8_t>::min(); + for (int i = 0; i < n_batch * n_unit; ++i) + { + int32_t x1 = output_temp_data[i]; + int32_t x2 = tflite::MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b); + int32_t x3 = x2 + output_zp; + int32_t x4 = std::min(std::max(output_min, x3), output_max); + output_data[i] = static_cast<int8_t>(x4); + } + } +} +static inline void +FloatSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const float *input_data, const tflite::RuntimeShape &weight_feature_shape, + const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const float *weight_time_data, const tflite::RuntimeShape &bias_shape, + const float *bias_data, float *scratchpad_data, float *activation_state_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + const int32_t rank = params.rank; + const int32_t batch_size = input_shape.Dims(0); + const int32_t input_size = input_shape.Dims(1); + const int32_t num_filters = weight_feature_shape.Dims(0); + const int32_t num_units = num_filters / rank; + const int32_t memory_size = weight_time_shape.Dims(1); + + // Left shift the activation_state. + { + float *new_state_start = activation_state_data; + const float *old_state_start = activation_state_data + 1; + const float *old_state_end = activation_state_data + batch_size * num_filters * memory_size; + while (old_state_start != old_state_end) + { + *new_state_start++ = *old_state_start++; + } + } + + // Note: no need to clear the latest activation, matmul is not accumulative. + + // Compute conv1d(inputs, weights_feature). + // The activation_state's rightmost column is used to save current cycle + // activation. This is achieved by starting at state_ptr[memory_size - 1] and + // having the stride equal to memory_size. + + // Perform batched matrix vector multiply operation: + { + const float *matrix = weight_feature_data; + const float *vector = input_data; + float *result = &activation_state_data[memory_size - 1]; + float *result_in_batch = result; + for (int i = 0; i < batch_size; ++i) + { + const float *matrix_ptr = matrix; + for (int j = 0; j < num_filters; ++j) + { + float dot_prod = 0.0f; + const float *vector_in_batch = vector + i * input_size; + for (int k = 0; k < input_size; ++k) + { + dot_prod += *matrix_ptr++ * *vector_in_batch++; + } + *result_in_batch = dot_prod; + result_in_batch += memory_size; + } + } + } + + tflite::reference_ops::ApplyTimeWeightsBiasAndActivation( + batch_size, memory_size, num_filters, num_units, rank, weight_time_data, bias_data, + params.activation, activation_state_data, scratchpad_data, output_data); +} + +static inline void SetupScratchpadTensor( + const luci_interpreter::DataType &input_data_type, + const luci_interpreter::DataType &weight_feature_data_type, + luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2, + luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4, + luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6, + const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape, + const int32_t batch_size, const int32_t num_filters, const int32_t num_units) +{ + + if (input_data_type == loco::DataType::FLOAT32 && + (weight_feature_data_type == loco::DataType::S8 || + weight_feature_data_type == loco::DataType::U8)) + { + (void)input_shape; + (void)weight_time_shape; + (void)scratchpad_3; + (void)scratchpad_4; + (void)scratchpad_5; + (void)scratchpad_6; + + throw std::runtime_error("Hybrid type is not currently supported for mcu platform"); + } + + // Resize scratchpad_1 tensor + scratchpad_1->resize({batch_size, num_filters}); + + if (input_data_type == loco::DataType::S8) + { + // Resize scratchpad_2 for full_integer op + scratchpad_2->resize({batch_size, num_units}); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_SVDF_H diff --git a/compiler/luci-interpreter/pal/mcu/pal.cmake b/compiler/luci-interpreter/pal/mcu/pal.cmake index a479d407b..907d51de6 100644 --- a/compiler/luci-interpreter/pal/mcu/pal.cmake +++ b/compiler/luci-interpreter/pal/mcu/pal.cmake @@ -39,7 +39,9 @@ macro(add_pal_to_target TGT) # TODO put it back, I changed my mind. # instead add sources with visitors in this library - set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc) add_library(luci_interpreter_mcu_pal STATIC ${PAL_SOURCES}) set_target_properties(luci_interpreter_mcu_pal PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(luci_interpreter_mcu_pal PRIVATE |