diff options
author | Chunseok Lee <chunseok.lee@samsung.com> | 2022-04-15 19:15:11 +0900 |
---|---|---|
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2022-04-15 19:15:11 +0900 |
commit | 3ad689f0803519e343c36d5700646e86059df961 (patch) | |
tree | 862346c401a5577518fa7f042532aa931b53aa0e /compiler/luci-interpreter | |
parent | ac6e4dd7b480e83b586ef533d7b29a8a97eb48fe (diff) | |
download | nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.gz nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.bz2 nnfw-3ad689f0803519e343c36d5700646e86059df961.zip |
Imported Upstream version 1.20.0upstream/1.20.0submit/tizen/20220415.103159
Diffstat (limited to 'compiler/luci-interpreter')
149 files changed, 7681 insertions, 289 deletions
diff --git a/compiler/luci-interpreter/README.md b/compiler/luci-interpreter/README.md index 4a9a34e6d..77ec5c81c 100644 --- a/compiler/luci-interpreter/README.md +++ b/compiler/luci-interpreter/README.md @@ -111,7 +111,7 @@ Note that one memory manager could be shared between multiple interpreter instan List of predefined memory managers: - `SimpleMemoryManager` This is a simple wrapper around new/delete, default one. -- `TestMemoryManager` Memorizes all allocated memory and releases it in Manager desctuctor, used in kernel unit tests. +- `TestMemoryManager` Memorizes all allocated memory and releases it in Manager destructor, used in kernel unit tests. - `BuddyMemoryManager` Implements Buddy algorithm, uses external buffer for tensor data allocations, does not need new/delete. - `StaticMemoryManger` Uses precomputed memory allocation plan. Requires preparation with MemoryPlanner, but could reduce memory consumption in restricted environments (like MCUs). diff --git a/compiler/luci-interpreter/include/luci_interpreter/GraphBuilderRegistry.h b/compiler/luci-interpreter/include/luci_interpreter/GraphBuilderRegistry.h new file mode 100644 index 000000000..375b1ae20 --- /dev/null +++ b/compiler/luci-interpreter/include/luci_interpreter/GraphBuilderRegistry.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __LUCI_INTERPRETER_GRAPH_BUILDER_REGISTRY__ +#define __LUCI_INTERPRETER_GRAPH_BUILDER_REGISTRY__ + +#include <luci/Import/GraphBuilderRegistry.h> + +namespace luci_interpreter +{ + +/** + * @brief Creates and returns GraphBuilderSource, which allows to not copy constant buffers from + * model's file. + * + * @warning Use this source only in case when model's buffer alive longer than Interpreter. + */ +std::unique_ptr<luci::GraphBuilderSource> source_without_constant_copying(); + +} // namespace luci_interpreter + +#endif // __LUCI_INTERPRETER_GRAPH_BUILDER_REGISTRY__ diff --git a/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h b/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h index 7dee8a7f2..8e2f457a5 100644 --- a/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h +++ b/compiler/luci-interpreter/include/luci_interpreter/Interpreter.h @@ -50,7 +50,9 @@ public: class Interpreter { public: - explicit Interpreter(const luci::Module *module, IMemoryManager *memory_manager = nullptr); + explicit Interpreter(const luci::Module *module); + + explicit Interpreter(const luci::Module *module, IMemoryManager *memory_manager); ~Interpreter(); @@ -69,7 +71,6 @@ private: // the order of deletion in the destructor std::unique_ptr<IMemoryManager> _default_memory_manager = nullptr; std::unique_ptr<class RuntimeModule> _runtime_module; - IMemoryManager *_memory_manager = nullptr; // Observer functionality support. 
std::unique_ptr<struct RuntimeToIR> _runtime_to_ir; diff --git a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst index 771974afe..d134a6b95 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst +++ b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst @@ -7,9 +7,11 @@ REGISTER_KERNEL(Concatenation) REGISTER_KERNEL(Conv2D) REGISTER_KERNEL(DepthToSpace) REGISTER_KERNEL(DepthwiseConv2D) +REGISTER_KERNEL(Dequantize) REGISTER_KERNEL(Div) REGISTER_KERNEL(Elu) REGISTER_KERNEL(Exp) +REGISTER_KERNEL(ExpandDims) REGISTER_KERNEL(Floor) REGISTER_KERNEL(FloorDiv) REGISTER_KERNEL(Equal) @@ -37,6 +39,7 @@ REGISTER_KERNEL(NotEqual) REGISTER_KERNEL(Pad) REGISTER_KERNEL(PadV2) REGISTER_KERNEL(PRelu) +REGISTER_KERNEL(Quantize) REGISTER_KERNEL(Reshape) REGISTER_KERNEL(ResizeBilinear) REGISTER_KERNEL(ResizeNearestNeighbor) @@ -50,6 +53,7 @@ REGISTER_KERNEL(Square) REGISTER_KERNEL(SquaredDifference) REGISTER_KERNEL(Squeeze) REGISTER_KERNEL(Sub) +REGISTER_KERNEL(SVDF) REGISTER_KERNEL(Tanh) REGISTER_KERNEL(Transpose) REGISTER_KERNEL(TransposeConv) diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h b/compiler/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h new file mode 100644 index 000000000..a274afb7e --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALAveragePool2d.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H +#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H + +#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h> +#include <tensorflow/lite/kernels/internal/reference/pooling.h> +#include <arm_nn_types.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void AveragePool(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, T *output_data, + const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "AveragePool NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void AveragePool<int8_t>(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, + int8_t *scratchpad_data) +{ + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(scratchpad_data != nullptr); + + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + assert(batches == 1); + + const int depth = tflite::MatchingDim(input_shape, 3, output_shape, 3); + + cmsis_nn_dims input_dims; + input_dims.n = 1; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = depth; + + cmsis_nn_dims output_dims; + output_dims.n = 1; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = depth; + + cmsis_nn_pool_params pool_params; + pool_params.stride.h = params.stride_height; + pool_params.stride.w 
= params.stride_width; + pool_params.padding.h = params.padding_values.height; + pool_params.padding.w = params.padding_values.width; + pool_params.activation.min = params.quantized_activation_min; + pool_params.activation.max = params.quantized_activation_max; + + cmsis_nn_dims filter_dims; + filter_dims.n = 1; + filter_dims.h = params.filter_height; + filter_dims.w = params.filter_width; + filter_dims.c = 1; + + cmsis_nn_context ctx; + ctx.buf = scratchpad_data; + ctx.size = scratchpad_shape.Dims(0); + auto res = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data, &filter_dims, &output_dims, + output_data); + assert(res == ARM_MATH_SUCCESS); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape) + +{ + if (input_data_type == luci_interpreter::DataType::S8) + { + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int32_t output_width = output_shape.Dims(2); + const int32_t depth = tflite::MatchingDim(input_shape, 3, output_shape, 3); + + const int32_t buf_size = arm_avgpool_s8_get_buffer_size(output_width, depth); + auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type)); + + luci_interpreter::Shape scratchpad_shape{buf_size * data_type_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h b/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h index 0a8ae4e48..cfb84ea60 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h +++ b/compiler/luci-interpreter/pal/cmsisnn/PALConv2d.h @@ -19,6 +19,8 @@ #include <tensorflow/lite/kernels/internal/reference/conv.h> #include 
<tensorflow/lite/kernels/internal/reference/integer_ops/conv.h> +#include <arm_nn_types.h> +#include <arm_nnfunctions.h> namespace luci_interpreter_pal { @@ -26,11 +28,11 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const float *input_data, const tflite::RuntimeShape &filter_shape, const float *filter_data, const tflite::RuntimeShape &bias_shape, const float *bias_data, const tflite::RuntimeShape &output_shape, - float *output_data, const tflite::RuntimeShape &im2col_shape, - float *im2col_data) + float *output_data, const tflite::RuntimeShape &scratchpad_shape, + float *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, tflite::RuntimeShape(), nullptr); @@ -40,14 +42,14 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const uint8 *input_data, const tflite::RuntimeShape &filter_shape, const uint8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - uint8 *output_data, const tflite::RuntimeShape &im2col_shape, - uint8 *im2col_data) + uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + uint8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data, nullptr); + bias_shape, bias_data, output_shape, output_data, scratchpad_shape, + scratchpad_data, nullptr); } static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_t *mult, @@ -55,14 +57,141 @@ static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_ const int8 *input_data, const tflite::RuntimeShape &filter_shape, 
const int8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - int8 *output_data, const tflite::RuntimeShape &im2col_shape, - int8 *im2col_data) + int8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + int8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; - tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, - output_shape, output_data); + if (scratchpad_data) + { + cmsis_nn_conv_params conv_params; + conv_params.dilation.h = params.dilation_height_factor; + conv_params.dilation.w = params.dilation_width_factor; + + assert(conv_params.dilation.h == 1); + assert(conv_params.dilation.w == 1); + + conv_params.input_offset = params.input_offset; + conv_params.output_offset = params.output_offset; + conv_params.stride.h = params.stride_height; + conv_params.stride.w = params.stride_width; + conv_params.padding.h = params.padding_values.height; + conv_params.padding.w = params.padding_values.width; + conv_params.activation.min = params.quantized_activation_min; + conv_params.activation.max = params.quantized_activation_max; + + cmsis_nn_per_channel_quant_params quant_params; + quant_params.multiplier = const_cast<int32_t *>(mult); + quant_params.shift = const_cast<int32_t *>(shifts); + + assert(conv_params.activation.min <= conv_params.activation.max); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = tflite::MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = 
input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_depth; + + cmsis_nn_dims filter_dims; + filter_dims.n = output_depth; + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = input_depth; + + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + cmsis_nn_context ctx; + ctx.buf = scratchpad_data; + ctx.size = scratchpad_shape.Dims(0); + + auto res = arm_convolve_wrapper_s8(&ctx, &conv_params, &quant_params, &input_dims, input_data, + &filter_dims, filter_data, &bias_dims, bias_data, + &output_dims, output_data); + assert(res == ARM_MATH_SUCCESS); + } + else + { + tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data); + } +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::ConvParams ¶ms, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + cmsis_nn_conv_params conv_params; + conv_params.dilation.h = params.dilation_height_factor; + conv_params.dilation.w = params.dilation_width_factor; + + if (input_data_type == loco::DataType::S8 && conv_params.dilation.h == 1 && + conv_params.dilation.w == 1) + { + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int32_t output_depth = tflite::MatchingDim(filter_shape, 0, output_shape, 3); + const int32_t filter_height = filter_shape.Dims(1); + const int32_t filter_width = filter_shape.Dims(2); + const 
int32_t output_height = output_shape.Dims(1); + const int32_t output_width = output_shape.Dims(2); + + conv_params.input_offset = params.input_offset; + conv_params.output_offset = params.output_offset; + conv_params.stride.h = params.stride_height; + conv_params.stride.w = params.stride_width; + conv_params.padding.h = params.padding_values.height; + conv_params.padding.w = params.padding_values.width; + + cmsis_nn_dims input_dims; + input_dims.n = batches; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_depth; + + cmsis_nn_dims filter_dims; + filter_dims.n = output_depth; + filter_dims.h = filter_height; + filter_dims.w = filter_width; + filter_dims.c = input_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batches; + output_dims.h = output_height; + output_dims.w = output_width; + output_dims.c = output_depth; + + const int32_t buf_size = arm_convolve_wrapper_s8_get_buffer_size(&conv_params, &input_dims, + &filter_dims, &output_dims); + + luci_interpreter::Shape scratchpad_shape{buf_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } } } // namespace luci_interpreter_pal diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h b/compiler/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h new file mode 100644 index 000000000..120dcd803 --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALDepthwiseConv2d.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H +#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H + +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h> +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void +DepthwiseConvPerChannel(const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, + const T *input_data, const tflite::RuntimeShape &filter_shape, + const T *filter_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, const tflite::RuntimeShape &output_shape, + T *output_data, const tflite::RuntimeShape &scratchpad_shape, + T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "DepthwiseConvPerChannel NYI"); + (void)params; + (void)output_multiplier; + (void)output_shift; + (void)input_shape; + (void)output_data; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void DepthwiseConvPerChannel<int8_t>( + const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const 
tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data) +{ + if (scratchpad_data) + { + cmsis_nn_dw_conv_params dw_conv_params; + dw_conv_params.dilation.h = params.dilation_height_factor; + dw_conv_params.dilation.w = params.dilation_width_factor; + assert(dw_conv_params.dilation.h == 1); + assert(dw_conv_params.dilation.w == 1); + + dw_conv_params.input_offset = params.input_offset; + dw_conv_params.output_offset = params.output_offset; + dw_conv_params.stride.h = params.stride_height; + dw_conv_params.stride.w = params.stride_width; + dw_conv_params.padding.h = params.padding_values.height; + dw_conv_params.padding.w = params.padding_values.width; + + dw_conv_params.activation.min = params.quantized_activation_min; + dw_conv_params.activation.max = params.quantized_activation_max; + dw_conv_params.ch_mult = params.depth_multiplier; + + cmsis_nn_per_channel_quant_params quant_params; + int32_t output_multiplier = params.output_multiplier; + int32_t output_shift = params.output_shift; + + quant_params.multiplier = &output_multiplier; + quant_params.shift = &output_shift; + + assert(dw_conv_params.activation.min <= dw_conv_params.activation.max); + const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_shape.Dims(3); + + cmsis_nn_dims filter_dims; + filter_dims.n = filter_shape.Dims(0); + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = output_depth; + + cmsis_nn_dims bias_dims; + 
bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + cmsis_nn_context ctx; + ctx.buf = scratchpad_data; + ctx.size = scratchpad_shape.Dims(0); + + auto res = arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params, &quant_params, &input_dims, + input_data, &filter_dims, filter_data, &bias_dims, + bias_data, &output_dims, output_data); + assert(res == ARM_MATH_SUCCESS); + } + else + { + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data); + } +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const tflite::DepthwiseParams ¶ms, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + cmsis_nn_dw_conv_params dw_conv_params; + dw_conv_params.dilation.h = params.dilation_height_factor; + dw_conv_params.dilation.w = params.dilation_width_factor; + + if (input_data_type == loco::DataType::S8 && dw_conv_params.dilation.h == 1 && + dw_conv_params.dilation.w == 1) + { + const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3); + + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_shape.Dims(3); + + cmsis_nn_dims filter_dims; + filter_dims.n = filter_shape.Dims(0); + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = 
output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + const int32_t buf_size = arm_depthwise_conv_wrapper_s8_get_buffer_size( + &dw_conv_params, &input_dims, &filter_dims, &output_dims); + + auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type)); + + luci_interpreter::Shape scratchpad_shape{buf_size * data_type_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h new file mode 100644 index 000000000..15ff0327b --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H +#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ + +template <typename T> +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_integer_ops::Dequantize<T>(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const uint8_t *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::Dequantize(params, input_shape, input_data, output_shape, output_data); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALFullyConnected.h b/compiler/luci-interpreter/pal/cmsisnn/PALFullyConnected.h new file mode 100644 index 000000000..32e905761 --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALFullyConnected.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H +#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H + +#include <tensorflow/lite/kernels/internal/reference/fully_connected.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void FullyConnected(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &filter_shape, const T *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + { + // MARK: At this moment this operation doesn't support + assert(false && "FullyConnected NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + } +} + +template <> +inline void +FullyConnected<int8_t>(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data) +{ + assert(output_shape.DimensionsCount() == 2); + + const int batches = output_shape.Dims(0); + const int output_depth = output_shape.Dims(1); + + const int filter_dim_count = filter_shape.DimensionsCount(); + const int accum_depth = filter_shape.Dims(filter_dim_count - 1); + + cmsis_nn_fc_params fc_params; + fc_params.input_offset = params.input_offset; + fc_params.output_offset = params.output_offset; + fc_params.filter_offset = params.weights_offset; + fc_params.activation.min = params.quantized_activation_min; + fc_params.activation.max = params.quantized_activation_max; + + cmsis_nn_per_tensor_quant_params quant_params; + 
quant_params.multiplier = params.output_multiplier; + quant_params.shift = params.output_shift; + + cmsis_nn_dims input_dims; + input_dims.n = batches; + input_dims.h = 1; + input_dims.w = 1; + input_dims.c = accum_depth; + + cmsis_nn_dims filter_dims; + filter_dims.n = accum_depth; + filter_dims.h = 1; + filter_dims.w = 1; + filter_dims.c = output_depth; + + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + cmsis_nn_dims output_dims; + output_dims.n = batches; + output_dims.h = 1; + output_dims.w = 1; + output_dims.c = output_depth; + + int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + auto buffer = std::make_unique<int8_t[]>(buf_size); + assert(buffer != nullptr); + + cmsis_nn_context ctx; + ctx.buf = buffer.get(); + ctx.size = buf_size; + + auto res = + arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims, + filter_data, &bias_dims, bias_data, &output_dims, output_data); + assert(res == ARM_MATH_SUCCESS); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALMul.h b/compiler/luci-interpreter/pal/cmsisnn/PALMul.h index 2b46b100c..347a97a83 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/PALMul.h +++ b/compiler/luci-interpreter/pal/cmsisnn/PALMul.h @@ -21,21 +21,21 @@ namespace luci_interpreter_pal { +template <typename T> static inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, - const float *input1_data, const tflite::RuntimeShape &input2_shape, - const float *input2_data, const tflite::RuntimeShape &output_shape, - float *output_data) + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, + T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, 
output_data); } -static inline void BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, - const tflite::RuntimeShape &input1_shape, - const float *input1_data, - const tflite::RuntimeShape &input2_shape, - const float *input2_data, - const tflite::RuntimeShape &output_shape, float *output_data) +template <typename T> +static inline void +BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h new file mode 100644 index 000000000..6046789ae --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H +#define LUCI_INTERPRETER_PAL_QUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Quantize(tflite::QuantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const float *input_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::reference_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data); +} + +template <typename Input, typename Output> +static inline void Requantize(const Input *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zero_point, int32_t output_zero_point, + Output *output_data) +{ + tflite::reference_ops::Requantize(input_data, size, effective_scale_multiplier, + effective_scale_shift, input_zero_point, output_zero_point, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALSVDF.h b/compiler/luci-interpreter/pal/cmsisnn/PALSVDF.h new file mode 100644 index 000000000..a4a5b2a78 --- /dev/null +++ b/compiler/luci-interpreter/pal/cmsisnn/PALSVDF.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_SVDF_H +#define LUCI_INTERPRETER_PAL_SVDF_H + +#include <arm_nn_types.h> +#include <arm_nnfunctions.h> + +namespace luci_interpreter_pal +{ +static inline void +IntegerSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape, + const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, int16_t *activation_state_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data, + int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a, + int scale_2_b, int32_t input_zp, int32_t output_zp) +{ + const int32_t rank = params.rank; + const int32_t batch_size = input_shape.Dims(0); + const int32_t num_filters = weight_feature_shape.Dims(0); + const int32_t memory_size = weight_time_shape.Dims(1); + + cmsis_nn_dims input_dims; + input_dims.n = input_shape.Dims(0); + input_dims.h = input_shape.Dims(1); + + cmsis_nn_dims weights_feature_dims; + weights_feature_dims.n = weight_feature_shape.Dims(0); + weights_feature_dims.h = weight_feature_shape.Dims(1); + + cmsis_nn_dims weights_time_dims; + weights_time_dims.n = weight_time_shape.Dims(0); + weights_time_dims.h = weight_time_shape.Dims(1); + + cmsis_nn_dims bias_dims; + bias_dims.n = bias_shape.Dims(0); + + cmsis_nn_dims state_dims; + state_dims.n = batch_size; + state_dims.h = memory_size * num_filters; + + cmsis_nn_dims output_dims; + output_dims.n = output_shape.Dims(0); + output_dims.h = output_shape.Dims(1); + + cmsis_nn_svdf_params svdf_params; + svdf_params.rank = params.rank; + svdf_params.input_offset = input_zp; + svdf_params.output_offset = output_zp; + + svdf_params.input_activation.min = INT16_MIN; + svdf_params.input_activation.max = INT16_MAX; + + svdf_params.output_activation.min = INT8_MIN; + 
svdf_params.output_activation.max = INT8_MAX; + + cmsis_nn_per_tensor_quant_params in_quant_params; + in_quant_params.multiplier = scale_1_a; + in_quant_params.shift = scale_1_b; + + cmsis_nn_per_tensor_quant_params out_quant_params; + out_quant_params.multiplier = scale_2_a; + out_quant_params.shift = scale_2_b; + + cmsis_nn_context scratch_ctx; + scratch_ctx.buf = scratchpad_data; + + cmsis_nn_context scratch_output_ctx; + scratch_output_ctx.buf = output_temp_data; + + arm_svdf_s8(&scratch_ctx, &scratch_output_ctx, &svdf_params, &in_quant_params, &out_quant_params, + &input_dims, input_data, &state_dims, activation_state_data, &weights_feature_dims, + weight_feature_data, &weights_time_dims, weight_time_data, &bias_dims, bias_data, + &output_dims, output_data); +} +static inline void +FloatSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const float *input_data, const tflite::RuntimeShape &weight_feature_shape, + const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const float *weight_time_data, const tflite::RuntimeShape &bias_shape, + const float *bias_data, float *scratchpad_data, float *activation_state_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + const int32_t rank = params.rank; + const int32_t batch_size = input_shape.Dims(0); + const int32_t input_size = input_shape.Dims(1); + const int32_t num_filters = weight_feature_shape.Dims(0); + const int32_t num_units = num_filters / rank; + const int32_t memory_size = weight_time_shape.Dims(1); + + // Left shift the activation_state. + { + float *new_state_start = activation_state_data; + const float *old_state_start = activation_state_data + 1; + const float *old_state_end = activation_state_data + batch_size * num_filters * memory_size; + while (old_state_start != old_state_end) + { + *new_state_start++ = *old_state_start++; + } + } + + // Note: no need to clear the latest activation, matmul is not accumulative. 
+ + // Compute conv1d(inputs, weights_feature). + // The activation_state's rightmost column is used to save current cycle + // activation. This is achieved by starting at state_ptr[memory_size - 1] and + // having the stride equal to memory_size. + + // Perform batched matrix vector multiply operation: + { + const float *matrix = weight_feature_data; + const float *vector = input_data; + float *result = &activation_state_data[memory_size - 1]; + float *result_in_batch = result; + for (int i = 0; i < batch_size; ++i) + { + const float *matrix_ptr = matrix; + for (int j = 0; j < num_filters; ++j) + { + float dot_prod = 0.0f; + const float *vector_in_batch = vector + i * input_size; + for (int k = 0; k < input_size; ++k) + { + dot_prod += *matrix_ptr++ * *vector_in_batch++; + } + *result_in_batch = dot_prod; + result_in_batch += memory_size; + } + } + } + + tflite::reference_ops::ApplyTimeWeightsBiasAndActivation( + batch_size, memory_size, num_filters, num_units, rank, weight_time_data, bias_data, + params.activation, activation_state_data, scratchpad_data, output_data); +} + +static inline void SetupScratchpadTensor( + const luci_interpreter::DataType &input_data_type, + const luci_interpreter::DataType &weight_feature_data_type, + luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2, + luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4, + luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6, + const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape, + const int32_t batch_size, const int32_t num_filters, const int32_t num_units) +{ + if (input_data_type == loco::DataType::FLOAT32 && + (weight_feature_data_type == loco::DataType::S8 || + weight_feature_data_type == loco::DataType::U8)) + { + (void)input_shape; + (void)weight_time_shape; + (void)scratchpad_3; + (void)scratchpad_4; + (void)scratchpad_5; + (void)scratchpad_6; + + throw 
std::runtime_error("Hybrid type is not supported for cmsisnn"); + } + + // Resize scratchpad_1 tensor + scratchpad_1->resize({batch_size, num_filters}); + + if (input_data_type == loco::DataType::S8) + { + // Resize scratchpad_2 for full_integer op + scratchpad_2->resize({batch_size, num_units}); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_SVDF_H diff --git a/compiler/luci-interpreter/pal/cmsisnn/pal.cmake b/compiler/luci-interpreter/pal/cmsisnn/pal.cmake index 9a25a3c5d..a68b363d9 100644 --- a/compiler/luci-interpreter/pal/cmsisnn/pal.cmake +++ b/compiler/luci-interpreter/pal/cmsisnn/pal.cmake @@ -42,9 +42,12 @@ macro(add_pal_to_target TGT) "${TensorFlowSource_DIR}") target_include_directories(${TGT} PRIVATE ${LUCI_INTERPRETER_PAL_DIR}) - set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + file(GLOB_RECURSE PAL_SOURCES "${CMSISSource_DIR}/CMSIS/NN/Source/*.c") + list(APPEND PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc) add_library(luci_interpreter_cmsisnn_pal STATIC ${PAL_SOURCES}) - set_target_properties(luci_interpreter_cmsisnn_pal PROPERTIES POSITION_INDEPENDENT_CODE ON) + set_property(TARGET luci_interpreter_cmsisnn_pal PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(luci_interpreter_cmsisnn_pal PRIVATE "${TensorFlowRuySource_DIR}" "${TensorFlowGEMMLowpSource_DIR}" @@ -53,7 +56,7 @@ macro(add_pal_to_target TGT) ) add_subdirectory(${CMSISSource_DIR}/CMSIS/NN ${CMAKE_CURRENT_BINARY_DIR}/CMSISNN) - target_include_directories(luci_interpreter_cmsisnn_pal PRIVATE + target_include_directories(luci_interpreter_cmsisnn_pal PUBLIC "${CMSISSource_DIR}/CMSIS/NN/Include" "${CMSISSource_DIR}/CMSIS/DSP/Include" "${CMSISSource_DIR}/CMSIS/Core/Include") diff --git 
a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst index 9d541276c..428b15ee0 100644 --- a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst +++ b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst @@ -1,19 +1,23 @@ REGISTER_KERNEL(Add) REGISTER_KERNEL(ArgMax) REGISTER_KERNEL(AveragePool2D) +REGISTER_KERNEL(BatchMatMul) REGISTER_KERNEL(BatchToSpaceND) REGISTER_KERNEL(Cast) REGISTER_KERNEL(Concatenation) REGISTER_KERNEL(Conv2D) REGISTER_KERNEL(DepthToSpace) REGISTER_KERNEL(DepthwiseConv2D) +REGISTER_KERNEL(Dequantize) REGISTER_KERNEL(Div) REGISTER_KERNEL(Elu) REGISTER_KERNEL(Exp) +REGISTER_KERNEL(ExpandDims) REGISTER_KERNEL(Floor) REGISTER_KERNEL(FloorDiv) REGISTER_KERNEL(Equal) REGISTER_KERNEL(FullyConnected) +REGISTER_KERNEL(Gather) REGISTER_KERNEL(Greater) REGISTER_KERNEL(GreaterEqual) REGISTER_KERNEL(If) @@ -37,11 +41,13 @@ REGISTER_KERNEL(MirrorPad) REGISTER_KERNEL(Mul) REGISTER_KERNEL(Neg) REGISTER_KERNEL(NotEqual) +REGISTER_KERNEL(OneHot) REGISTER_KERNEL(Pack) REGISTER_KERNEL(Pad) REGISTER_KERNEL(PadV2) REGISTER_KERNEL(Pow) REGISTER_KERNEL(PRelu) +REGISTER_KERNEL(Quantize) REGISTER_KERNEL(Relu) REGISTER_KERNEL(Relu6) REGISTER_KERNEL(Reshape) @@ -61,6 +67,7 @@ REGISTER_KERNEL(Square) REGISTER_KERNEL(SquaredDifference) REGISTER_KERNEL(Squeeze) REGISTER_KERNEL(Sub) +REGISTER_KERNEL(SVDF) REGISTER_KERNEL(Tanh) REGISTER_KERNEL(Transpose) REGISTER_KERNEL(TransposeConv) diff --git a/compiler/luci-interpreter/pal/linux/PALAveragePool2d.h b/compiler/luci-interpreter/pal/linux/PALAveragePool2d.h new file mode 100644 index 000000000..cce30601f --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALAveragePool2d.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H +#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H + +#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h> +#include <tensorflow/lite/kernels/internal/reference/pooling.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void AveragePool(const tflite::PoolParams &params, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, T *output_data, + const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "AveragePool NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void AveragePool<int8_t>(const tflite::PoolParams &params, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, + int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + + tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)input_data_type; + (void)input_shape; + (void)output_shape; + 
scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H diff --git a/compiler/luci-interpreter/pal/linux/PALBatchMatMul.h b/compiler/luci-interpreter/pal/linux/PALBatchMatMul.h new file mode 100644 index 000000000..3894f2d92 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALBatchMatMul.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_BATCHMATMUL_H +#define LUCI_INTERPRETER_PAL_BATCHMATMUL_H + +#include <tensorflow/lite/kernels/internal/reference/batch_matmul.h> + +namespace luci_interpreter_pal +{ +inline void BatchMatMul(const tflite::RuntimeShape &lhs_shape, const float *lhs_data, + const tflite::RuntimeShape &rhs_shape, const float *rhs_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::BatchMatMul(lhs_shape, lhs_data, rhs_shape, rhs_data, output_shape, + output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *lhs_scratchpad, + luci_interpreter::Tensor *rhs_scratchpad, + const tflite::RuntimeShape &lhs_shape, + const tflite::RuntimeShape &rhs_shape) +{ + // Scratchpad for transposed LHS + { + auto lhs_rank = lhs_shape.DimensionsCount(); + luci_interpreter::Shape scratchpad_size(lhs_rank); + for (int i = 0; i < lhs_rank - 2; ++i) + { + scratchpad_size.dim(i) = lhs_shape.Dims(i); + } + scratchpad_size.dim(lhs_rank - 2) = lhs_shape.Dims(lhs_rank - 1); + scratchpad_size.dim(lhs_rank - 1) = lhs_shape.Dims(lhs_rank - 2); + + lhs_scratchpad->resize(scratchpad_size); + } + // Scratchpad for transposed RHS + { + auto rhs_rank = rhs_shape.DimensionsCount(); + luci_interpreter::Shape scratchpad_size(rhs_rank); + for (int i = 0; i < rhs_rank - 2; ++i) + { + scratchpad_size.dim(i) = rhs_shape.Dims(i); + } + scratchpad_size.dim(rhs_rank - 2) = rhs_shape.Dims(rhs_rank - 1); + scratchpad_size.dim(rhs_rank - 1) = rhs_shape.Dims(rhs_rank - 2); + + rhs_scratchpad->resize(scratchpad_size); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_BATCHMATMUL_H diff --git a/compiler/luci-interpreter/pal/linux/PALConv2d.h b/compiler/luci-interpreter/pal/linux/PALConv2d.h index 2550dd5d7..985a15f39 100644 --- a/compiler/luci-interpreter/pal/linux/PALConv2d.h +++ b/compiler/luci-interpreter/pal/linux/PALConv2d.h @@ -26,14 +26,24 @@ static inline void Conv(const tflite::ConvParams 
¶ms, const tflite::RuntimeS const float *input_data, const tflite::RuntimeShape &filter_shape, const float *filter_data, const tflite::RuntimeShape &bias_shape, const float *bias_data, const tflite::RuntimeShape &output_shape, - float *output_data, const tflite::RuntimeShape &im2col_shape, - float *im2col_data) + float *output_data, const tflite::RuntimeShape &scratchpad_shape, + float *scratchpad_data) { - if (im2col_data) + (void)scratchpad_shape; + if (scratchpad_data) { + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int32_t output_height = output_shape.Dims(1); + const int32_t output_width = output_shape.Dims(2); + const int32_t filter_height = filter_shape.Dims(1); + const int32_t filter_width = filter_shape.Dims(2); + tflite::RuntimeShape im2col_shape{batches, output_height, output_width, + input_depth * filter_height * filter_width}; + tflite::optimized_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data); + scratchpad_data); } else tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, @@ -45,8 +55,8 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const uint8 *input_data, const tflite::RuntimeShape &filter_shape, const uint8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - uint8 *output_data, const tflite::RuntimeShape &im2col_shape, - uint8 *im2col_data) + uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + uint8 *scratchpad_data) { // TODO This should only be done once (although it takes only a few microseconds). // Also, the user should be able to adjust the number of threads. 
@@ -54,8 +64,8 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency())); tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data, gemmlowp_context.get()); + bias_shape, bias_data, output_shape, output_data, scratchpad_shape, + scratchpad_data, gemmlowp_context.get()); } static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_t *mult, @@ -63,17 +73,55 @@ static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_ const int8 *input_data, const tflite::RuntimeShape &filter_shape, const int8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - int8 *output_data, const tflite::RuntimeShape &im2col_shape, - int8 *im2col_data) + int8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + int8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; // TODO enable optimized version tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data); } +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::ConvParams ¶ms, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + const int32_t filter_height = filter_shape.Dims(1); + const int32_t filter_width = filter_shape.Dims(2); + + // Allocate tensor for scratchpad, if needed. + // The checks here should be aligned with the actual implementation. 
+ const bool need_dilated_scratchpad = + params.dilation_height_factor != 1 || params.dilation_width_factor != 1; + const bool need_non_dilated_scratchpad = params.stride_height != 1 || params.stride_width != 1 || + filter_height != 1 || filter_width != 1; + auto _need_scratchpad = input_data_type != luci_interpreter::DataType::S16 && + (need_dilated_scratchpad || need_non_dilated_scratchpad); + + if (_need_scratchpad) + { + const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_depth = tflite::MatchingDim(input_shape, 3, filter_shape, 3); + const int32_t output_height = output_shape.Dims(1); + const int32_t output_width = output_shape.Dims(2); + + auto data_type_size = static_cast<int32_t>(luci_interpreter::getDataTypeSize(input_data_type)); + int32_t scratchpad_size = batches * output_width * output_height * input_depth * filter_height * + filter_width * data_type_size; + luci_interpreter::Shape scratchpad_shape{scratchpad_size}; + scratchpad->resize(scratchpad_shape); + } + else + { + scratchpad->set_allocatable(false); + } +} + } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_CONV2D_H diff --git a/compiler/luci-interpreter/pal/linux/PALDepthwiseConv2d.h b/compiler/luci-interpreter/pal/linux/PALDepthwiseConv2d.h new file mode 100644 index 000000000..c9d1a2948 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALDepthwiseConv2d.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H +#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H + +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h> +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void +DepthwiseConvPerChannel(const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, + const T *input_data, const tflite::RuntimeShape &filter_shape, + const T *filter_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, const tflite::RuntimeShape &output_shape, + T *output_data, const tflite::RuntimeShape &scratchpad_shape, + T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "DepthwiseConvPerChannel NYI"); + (void)params; + (void)output_multiplier; + (void)output_shift; + (void)input_shape; + (void)output_data; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void DepthwiseConvPerChannel<int8_t>( + const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + 
tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const tflite::DepthwiseParams ¶ms, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)params; + (void)input_data_type; + (void)input_shape; + (void)filter_shape; + (void)output_shape; + + scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H diff --git a/compiler/luci-interpreter/pal/linux/PALDequantize.h b/compiler/luci-interpreter/pal/linux/PALDequantize.h new file mode 100644 index 000000000..3af6d0777 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALDequantize.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H +#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H + +#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::optimized_ops::Dequantize(params, input_shape, input_data, output_shape, output_data); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H diff --git a/compiler/luci-interpreter/pal/linux/PALFullyConnected.h b/compiler/luci-interpreter/pal/linux/PALFullyConnected.h new file mode 100644 index 000000000..62970dbf7 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALFullyConnected.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H +#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H + +#include <tensorflow/lite/kernels/internal/reference/fully_connected.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void FullyConnected(const tflite::FullyConnectedParams &params, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &filter_shape, const T *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "FullyConnected NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + } +} + +template <> +inline void +FullyConnected<int8_t>(const tflite::FullyConnectedParams &params, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data) +{ + tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H diff --git a/compiler/luci-interpreter/pal/linux/PALGather.h b/compiler/luci-interpreter/pal/linux/PALGather.h new file mode 100644 index 000000000..49ac35f93 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALGather.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_GATHER_H +#define LUCI_INTERPRETER_PAL_GATHER_H + +#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h> + +namespace luci_interpreter_pal +{ +template <typename T, typename CoordsT = int32> +static inline void Gather(const tflite::GatherParams &op_params, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &coords_shape, const CoordsT *coords_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::optimized_ops::Gather(op_params, input_shape, input_data, coords_shape, coords_data, + output_shape, output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_GATHER_H diff --git a/compiler/luci-interpreter/pal/linux/PALMul.h b/compiler/luci-interpreter/pal/linux/PALMul.h index cfaec1b58..a8a9d4abc 100644 --- a/compiler/luci-interpreter/pal/linux/PALMul.h +++ b/compiler/luci-interpreter/pal/linux/PALMul.h @@ -21,21 +21,31 @@ namespace luci_interpreter_pal { +template <typename T> static inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, - const float *input1_data, const tflite::RuntimeShape &input2_shape, - const float *input2_data, const tflite::RuntimeShape &output_shape, - float *output_data) + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape 
&output_shape, + T *output_data) { tflite::optimized_ops::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); } -static inline void BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, - const tflite::RuntimeShape &input1_shape, - const float *input1_data, - const tflite::RuntimeShape &input2_shape, - const float *input2_data, - const tflite::RuntimeShape &output_shape, float *output_data) +template <> +inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const int64_t *input1_data, const tflite::RuntimeShape &input2_shape, + const int64_t *input2_data, const tflite::RuntimeShape &output_shape, + int64_t *output_data) +{ + tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); +} + +template <typename T> +static inline void +BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data) { tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); diff --git a/compiler/luci-interpreter/pal/linux/PALQuantize.h b/compiler/luci-interpreter/pal/linux/PALQuantize.h new file mode 100644 index 000000000..bf1d7954e --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALQuantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H +#define LUCI_INTERPRETER_PAL_QUANTIZE_H + +#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Quantize(tflite::QuantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const float *input_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::optimized_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data); +} + +template <typename Input, typename Output> +static inline void Requantize(const Input *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zero_point, int32_t output_zero_point, + Output *output_data) +{ + tflite::optimized_ops::Requantize(input_data, size, effective_scale_multiplier, + effective_scale_shift, input_zero_point, output_zero_point, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H diff --git a/compiler/luci-interpreter/pal/linux/PALSVDF.h b/compiler/luci-interpreter/pal/linux/PALSVDF.h new file mode 100644 index 000000000..0ffba14f0 --- /dev/null +++ b/compiler/luci-interpreter/pal/linux/PALSVDF.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_SVDF_H +#define LUCI_INTERPRETER_PAL_SVDF_H + +#include <tensorflow/lite/kernels/internal/reference/svdf.h> + +namespace luci_interpreter_pal +{ +static inline void +IntegerSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape, + const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, int16_t *activation_state_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data, + int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a, + int scale_2_b, int32_t input_zp, int32_t output_zp) +{ + tflite::reference_ops::EvalIntegerSVDF(¶ms, input_shape, input_data, weight_feature_shape, + weight_feature_data, weight_time_shape, weight_time_data, + bias_shape, bias_data, activation_state_data, output_shape, + output_data, scratchpad_data, output_temp_data, scale_1_a, + scale_1_b, scale_2_a, scale_2_b, input_zp, output_zp); +} +static inline void +FloatSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const float *input_data, const tflite::RuntimeShape &weight_feature_shape, + const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const float *weight_time_data, const tflite::RuntimeShape &bias_shape, + const float *bias_data, float *scratchpad_data, float *activation_state_data, + const 
tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::EvalFloatSVDF(¶ms, input_shape, input_data, weight_feature_shape, + weight_feature_data, weight_time_shape, weight_time_data, + bias_shape, bias_data, scratchpad_data, + activation_state_data, output_shape, output_data); +} + +static inline void SetupScratchpadTensor( + const luci_interpreter::DataType &input_data_type, + const luci_interpreter::DataType &weight_feature_data_type, + luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2, + luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4, + luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6, + const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape, + const int32_t batch_size, const int32_t num_filters, const int32_t num_units) +{ + + if (input_data_type == loco::DataType::FLOAT32 && + (weight_feature_data_type == loco::DataType::S8 || + weight_feature_data_type == loco::DataType::U8)) + { + (void)input_shape; + (void)weight_time_shape; + (void)scratchpad_3; + (void)scratchpad_4; + (void)scratchpad_5; + (void)scratchpad_6; + + throw std::runtime_error("Hybrid type is not currently supported for linux platform"); + } + + // Resize scratchpad_1 tensor + scratchpad_1->resize({batch_size, num_filters}); + + if (input_data_type == loco::DataType::S8) + { + // Resize scratchpad_2 for full_integer op + scratchpad_2->resize({batch_size, num_units}); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_SVDF_H diff --git a/compiler/luci-interpreter/pal/linux/pal.cmake b/compiler/luci-interpreter/pal/linux/pal.cmake index 84349e0bf..185700cf9 100644 --- a/compiler/luci-interpreter/pal/linux/pal.cmake +++ b/compiler/luci-interpreter/pal/linux/pal.cmake @@ -40,7 +40,35 @@ macro(add_pal_to_target TGT) # TODO put it back, I changed my mind. 
# instead add sources with visitors in this library - set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + + if(BUILD_ARM32_NEON) + # NOTE may need to revise this list for version upgrade + set(PAL_SOURCES ${PAL_SOURCES} + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/optimized/cpu_check.cc + ${TensorFlowRuySource_DIR}/ruy/allocator.cc + ${TensorFlowRuySource_DIR}/ruy/block_map.cc + ${TensorFlowRuySource_DIR}/ruy/blocking_counter.cc + ${TensorFlowRuySource_DIR}/ruy/context_get_ctx.cc + ${TensorFlowRuySource_DIR}/ruy/cpuinfo.cc + ${TensorFlowRuySource_DIR}/ruy/ctx.cc + ${TensorFlowRuySource_DIR}/ruy/denormal.cc + ${TensorFlowRuySource_DIR}/ruy/frontend.cc + ${TensorFlowRuySource_DIR}/ruy/pack_arm.cc + ${TensorFlowRuySource_DIR}/ruy/prepacked_cache.cc + ${TensorFlowRuySource_DIR}/ruy/prepare_packed_matrices.cc + ${TensorFlowRuySource_DIR}/ruy/system_aligned_alloc.cc + ${TensorFlowRuySource_DIR}/ruy/thread_pool.cc + ${TensorFlowRuySource_DIR}/ruy/trmul.cc + ${TensorFlowRuySource_DIR}/ruy/tune.cc + ${TensorFlowRuySource_DIR}/ruy/wait.cc + ${TensorFlowRuySource_DIR}/ruy/kernel_arm32.cc + ) + endif(BUILD_ARM32_NEON) + add_library(luci_interpreter_linux_pal STATIC ${PAL_SOURCES}) set_target_properties(luci_interpreter_linux_pal PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(luci_interpreter_linux_pal SYSTEM PRIVATE diff --git a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst index 771974afe..d134a6b95 100644 --- a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst +++ 
b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst @@ -7,9 +7,11 @@ REGISTER_KERNEL(Concatenation) REGISTER_KERNEL(Conv2D) REGISTER_KERNEL(DepthToSpace) REGISTER_KERNEL(DepthwiseConv2D) +REGISTER_KERNEL(Dequantize) REGISTER_KERNEL(Div) REGISTER_KERNEL(Elu) REGISTER_KERNEL(Exp) +REGISTER_KERNEL(ExpandDims) REGISTER_KERNEL(Floor) REGISTER_KERNEL(FloorDiv) REGISTER_KERNEL(Equal) @@ -37,6 +39,7 @@ REGISTER_KERNEL(NotEqual) REGISTER_KERNEL(Pad) REGISTER_KERNEL(PadV2) REGISTER_KERNEL(PRelu) +REGISTER_KERNEL(Quantize) REGISTER_KERNEL(Reshape) REGISTER_KERNEL(ResizeBilinear) REGISTER_KERNEL(ResizeNearestNeighbor) @@ -50,6 +53,7 @@ REGISTER_KERNEL(Square) REGISTER_KERNEL(SquaredDifference) REGISTER_KERNEL(Squeeze) REGISTER_KERNEL(Sub) +REGISTER_KERNEL(SVDF) REGISTER_KERNEL(Tanh) REGISTER_KERNEL(Transpose) REGISTER_KERNEL(TransposeConv) diff --git a/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h b/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h new file mode 100644 index 000000000..cce30601f --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H +#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H + +#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h> +#include <tensorflow/lite/kernels/internal/reference/pooling.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void AveragePool(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, T *output_data, + const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data) +{ + { + // MARK: At this moment this operation doesn't support + assert(false && "AveragePool NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void AveragePool<int8_t>(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, + int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + + tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)input_data_type; + (void)input_shape; + (void)output_shape; + + scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H diff --git a/compiler/luci-interpreter/pal/mcu/PALConv2d.h b/compiler/luci-interpreter/pal/mcu/PALConv2d.h index 0a8ae4e48..13976877a 100644 --- a/compiler/luci-interpreter/pal/mcu/PALConv2d.h +++ b/compiler/luci-interpreter/pal/mcu/PALConv2d.h @@ -26,11 +26,11 @@ static inline void Conv(const 
tflite::ConvParams ¶ms, const tflite::RuntimeS const float *input_data, const tflite::RuntimeShape &filter_shape, const float *filter_data, const tflite::RuntimeShape &bias_shape, const float *bias_data, const tflite::RuntimeShape &output_shape, - float *output_data, const tflite::RuntimeShape &im2col_shape, - float *im2col_data) + float *output_data, const tflite::RuntimeShape &scratchpad_shape, + float *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, tflite::RuntimeShape(), nullptr); @@ -40,14 +40,14 @@ static inline void Conv(const tflite::ConvParams ¶ms, const tflite::RuntimeS const uint8 *input_data, const tflite::RuntimeShape &filter_shape, const uint8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - uint8 *output_data, const tflite::RuntimeShape &im2col_shape, - uint8 *im2col_data) + uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + uint8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data, im2col_shape, - im2col_data, nullptr); + bias_shape, bias_data, output_shape, output_data, scratchpad_shape, + scratchpad_data, nullptr); } static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_t *mult, @@ -55,16 +55,31 @@ static inline void ConvPerChannel(const tflite::ConvParams ¶ms, const int32_ const int8 *input_data, const tflite::RuntimeShape &filter_shape, const int8 *filter_data, const tflite::RuntimeShape &bias_shape, const int32 *bias_data, const tflite::RuntimeShape &output_shape, - int8 *output_data, const tflite::RuntimeShape &im2col_shape, - int8 
*im2col_data) + int8 *output_data, const tflite::RuntimeShape &scratchpad_shape, + int8 *scratchpad_data) { - (void)im2col_shape; - (void)im2col_data; + (void)scratchpad_shape; + (void)scratchpad_data; tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data); } +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const luci_interpreter::DataType &input_data_type, + const tflite::ConvParams ¶ms, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) +{ + (void)input_data_type; + (void)params; + (void)input_shape; + (void)filter_shape; + (void)output_shape; + scratchpad->set_allocatable(false); +} + } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_CONV2D_H diff --git a/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h b/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h new file mode 100644 index 000000000..c9d1a2948 --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H +#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H + +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h> +#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void +DepthwiseConvPerChannel(const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, + const T *input_data, const tflite::RuntimeShape &filter_shape, + const T *filter_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, const tflite::RuntimeShape &output_shape, + T *output_data, const tflite::RuntimeShape &scratchpad_shape, + T *scratchpad_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "DepthwiseConvPerChannel NYI"); + (void)params; + (void)output_multiplier; + (void)output_shift; + (void)input_shape; + (void)output_data; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + (void)scratchpad_shape; + (void)scratchpad_data; + } +} + +template <> +inline void DepthwiseConvPerChannel<int8_t>( + const tflite::DepthwiseParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, + const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data) +{ + (void)scratchpad_shape; + (void)scratchpad_data; + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, 
filter_data, + bias_shape, bias_data, output_shape, output_data); +} + +static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad, + const tflite::DepthwiseParams ¶ms, + const luci_interpreter::DataType &input_data_type, + const tflite::RuntimeShape &input_shape, + const tflite::RuntimeShape &filter_shape, + const tflite::RuntimeShape &output_shape) + +{ + (void)params; + (void)input_data_type; + (void)input_shape; + (void)filter_shape; + (void)output_shape; + + scratchpad->set_allocatable(false); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H diff --git a/compiler/luci-interpreter/pal/mcu/PALDequantize.h b/compiler/luci-interpreter/pal/mcu/PALDequantize.h new file mode 100644 index 000000000..15ff0327b --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALDequantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H +#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h" +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ + +template <typename T> +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_integer_ops::Dequantize<T>(params, input_shape, input_data, output_shape, + output_data); +} + +static inline void Dequantize(tflite::DequantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const uint8_t *input_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + tflite::reference_ops::Dequantize(params, input_shape, input_data, output_shape, output_data); +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H diff --git a/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h b/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h new file mode 100644 index 000000000..048624d74 --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H +#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H + +#include <tensorflow/lite/kernels/internal/reference/fully_connected.h> +#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h> + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void FullyConnected(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const T *input_data, + const tflite::RuntimeShape &filter_shape, const T *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + { + // MARK: At this moment this operation is not supported + assert(false && "FullyConnected NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_shape; + (void)bias_data; + (void)output_shape; + (void)output_data; + } +} + +template <> +inline void +FullyConnected<int8_t>(const tflite::FullyConnectedParams ¶ms, + const tflite::RuntimeShape &input_shape, const int8_t *input_data, + const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, + const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data) +{ + tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H diff --git a/compiler/luci-interpreter/pal/mcu/PALMul.h b/compiler/luci-interpreter/pal/mcu/PALMul.h index 2b46b100c..347a97a83 100644 --- a/compiler/luci-interpreter/pal/mcu/PALMul.h +++ b/compiler/luci-interpreter/pal/mcu/PALMul.h @@ -21,21 +21,21 @@ namespace luci_interpreter_pal { +template <typename T> static inline void Mul(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, - const float 
*input1_data, const tflite::RuntimeShape &input2_shape, - const float *input2_data, const tflite::RuntimeShape &output_shape, - float *output_data) + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, + T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); } -static inline void BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, - const tflite::RuntimeShape &input1_shape, - const float *input1_data, - const tflite::RuntimeShape &input2_shape, - const float *input2_data, - const tflite::RuntimeShape &output_shape, float *output_data) +template <typename T> +static inline void +BroadcastMul4DSlow(tflite::ArithmeticParams ¶ms, const tflite::RuntimeShape &input1_shape, + const T *input1_data, const tflite::RuntimeShape &input2_shape, + const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data) { tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); diff --git a/compiler/luci-interpreter/pal/mcu/PALQuantize.h b/compiler/luci-interpreter/pal/mcu/PALQuantize.h new file mode 100644 index 000000000..6046789ae --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALQuantize.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H +#define LUCI_INTERPRETER_PAL_QUANTIZE_H + +#include "tensorflow/lite/kernels/internal/reference/reference_ops.h" + +namespace luci_interpreter_pal +{ +template <typename T> +static inline void Quantize(tflite::QuantizationParams ¶ms, + const tflite::RuntimeShape &input_shape, const float *input_data, + const tflite::RuntimeShape &output_shape, T *output_data) +{ + tflite::reference_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data); +} + +template <typename Input, typename Output> +static inline void Requantize(const Input *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zero_point, int32_t output_zero_point, + Output *output_data) +{ + tflite::reference_ops::Requantize(input_data, size, effective_scale_multiplier, + effective_scale_shift, input_zero_point, output_zero_point, + output_data); +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H diff --git a/compiler/luci-interpreter/pal/mcu/PALSVDF.h b/compiler/luci-interpreter/pal/mcu/PALSVDF.h new file mode 100644 index 000000000..3bba668fb --- /dev/null +++ b/compiler/luci-interpreter/pal/mcu/PALSVDF.h @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_SVDF_H +#define LUCI_INTERPRETER_PAL_SVDF_H + +#include <tensorflow/lite/kernels/internal/reference/svdf.h> + +namespace luci_interpreter_pal +{ +static inline void +IntegerSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape, + const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape, + const int32_t *bias_data, int16_t *activation_state_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data, + int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a, + int scale_2_b, int32_t input_zp, int32_t output_zp) +{ + const int n_rank = params.rank; + const int n_batch = input_shape.Dims(0); + const int n_input = input_shape.Dims(1); + const int n_filter = weight_feature_shape.Dims(0); + const int n_unit = n_filter / n_rank; + const int n_memory = weight_time_shape.Dims(1); + + // Left shift the activation_state. + { + int16_t *new_state_start = activation_state_data; + const int16_t *old_state_start = activation_state_data + 1; + const int16_t *old_state_end = activation_state_data + n_batch * n_filter * n_memory; + while (old_state_start != old_state_end) + { + *new_state_start++ = *old_state_start++; + } + } + + // Note: no need to clear the latest activation, matmul is not accumulative. + + // Feature matmul. 
+ { + const int32_t output_max = std::numeric_limits<int16_t>::max(); + const int32_t output_min = std::numeric_limits<int16_t>::min(); + int16_t *result_in_batch = activation_state_data + (n_memory - 1); + for (int b = 0; b < n_batch; b++) + { + const int8_t *matrix_ptr = weight_feature_data; + for (int r = 0; r < n_filter; r++) + { + int32_t dot_prod = 0; + const int8_t *vector_in_batch = input_data + b * n_input; + for (int c = 0; c < n_input; c++) + { + dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp); + } + dot_prod = tflite::MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b); + dot_prod = std::min(std::max(output_min, dot_prod), output_max); + // This assumes state is symmetrically quantized. Otherwise last bit of + // state should be initialized to its zero point and accumulate the + // dot_prod. + // Equivalent as the following: + // result_in_batch = zero point, which happens to be zero. + // result_in_batch += dot_prod_56. + *result_in_batch = dot_prod; + result_in_batch += n_memory; + } + } + } + + // Time. + { + for (int b = 0; b < n_batch; ++b) + { + int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter; + + // Perform batched vector dot product: + const int16_t *vector1_ptr = weight_time_data; + const int16_t *vector2_ptr = activation_state_data + b * n_memory * n_filter; + + for (int i = 0; i < n_filter; i++) + { + *scratch_ptr_batch = 0; + for (int j = 0; j < n_memory; j++) + { + *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++; + } + scratch_ptr_batch++; + } + } + } + + // Reduce, add bias, rescale, activation. + { + // Add bias. 
+ if (bias_data) + { + // Vector batch assign: + for (int i = 0; i < n_batch; ++i) + { + int32_t *output_ptr = output_temp_data + i * n_unit; + const int32_t *bias_ptr = bias_data; + for (int j = 0; j < n_unit; ++j) + { + *output_ptr++ = *bias_ptr++; + } + } + } + else + { + int32_t *output_ptr = output_temp_data; + for (int i = 0; i < n_batch * n_unit; ++i) + { + *output_ptr++ = 0; + } + } + + // Reduce. + for (int b = 0; b < n_batch; ++b) + { + int32_t *output_temp_ptr = output_temp_data + b * n_unit; + int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter; + + // Reduction sum vector + for (int i = 0; i < n_unit; ++i) + { + for (int j = 0; j < n_rank; ++j) + { + output_temp_ptr[i] += *scratch_ptr_batch++; + } + } + } + + // Rescale. + const int32_t output_max = std::numeric_limits<int8_t>::max(); + const int32_t output_min = std::numeric_limits<int8_t>::min(); + for (int i = 0; i < n_batch * n_unit; ++i) + { + int32_t x1 = output_temp_data[i]; + int32_t x2 = tflite::MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b); + int32_t x3 = x2 + output_zp; + int32_t x4 = std::min(std::max(output_min, x3), output_max); + output_data[i] = static_cast<int8_t>(x4); + } + } +} +static inline void +FloatSVDF(const TfLiteSVDFParams ¶ms, const tflite::RuntimeShape &input_shape, + const float *input_data, const tflite::RuntimeShape &weight_feature_shape, + const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape, + const float *weight_time_data, const tflite::RuntimeShape &bias_shape, + const float *bias_data, float *scratchpad_data, float *activation_state_data, + const tflite::RuntimeShape &output_shape, float *output_data) +{ + const int32_t rank = params.rank; + const int32_t batch_size = input_shape.Dims(0); + const int32_t input_size = input_shape.Dims(1); + const int32_t num_filters = weight_feature_shape.Dims(0); + const int32_t num_units = num_filters / rank; + const int32_t memory_size = weight_time_shape.Dims(1); + + // Left shift the 
activation_state. + { + float *new_state_start = activation_state_data; + const float *old_state_start = activation_state_data + 1; + const float *old_state_end = activation_state_data + batch_size * num_filters * memory_size; + while (old_state_start != old_state_end) + { + *new_state_start++ = *old_state_start++; + } + } + + // Note: no need to clear the latest activation, matmul is not accumulative. + + // Compute conv1d(inputs, weights_feature). + // The activation_state's rightmost column is used to save current cycle + // activation. This is achieved by starting at state_ptr[memory_size - 1] and + // having the stride equal to memory_size. + + // Perform batched matrix vector multiply operation: + { + const float *matrix = weight_feature_data; + const float *vector = input_data; + float *result = &activation_state_data[memory_size - 1]; + float *result_in_batch = result; + for (int i = 0; i < batch_size; ++i) + { + const float *matrix_ptr = matrix; + for (int j = 0; j < num_filters; ++j) + { + float dot_prod = 0.0f; + const float *vector_in_batch = vector + i * input_size; + for (int k = 0; k < input_size; ++k) + { + dot_prod += *matrix_ptr++ * *vector_in_batch++; + } + *result_in_batch = dot_prod; + result_in_batch += memory_size; + } + } + } + + tflite::reference_ops::ApplyTimeWeightsBiasAndActivation( + batch_size, memory_size, num_filters, num_units, rank, weight_time_data, bias_data, + params.activation, activation_state_data, scratchpad_data, output_data); +} + +static inline void SetupScratchpadTensor( + const luci_interpreter::DataType &input_data_type, + const luci_interpreter::DataType &weight_feature_data_type, + luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2, + luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4, + luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6, + const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape, + 
const int32_t batch_size, const int32_t num_filters, const int32_t num_units) +{ + + if (input_data_type == loco::DataType::FLOAT32 && + (weight_feature_data_type == loco::DataType::S8 || + weight_feature_data_type == loco::DataType::U8)) + { + (void)input_shape; + (void)weight_time_shape; + (void)scratchpad_3; + (void)scratchpad_4; + (void)scratchpad_5; + (void)scratchpad_6; + + throw std::runtime_error("Hybrid type is not currently supported for mcu platform"); + } + + // Resize scratchpad_1 tensor + scratchpad_1->resize({batch_size, num_filters}); + + if (input_data_type == loco::DataType::S8) + { + // Resize scratchpad_2 for full_integer op + scratchpad_2->resize({batch_size, num_units}); + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_SVDF_H diff --git a/compiler/luci-interpreter/pal/mcu/pal.cmake b/compiler/luci-interpreter/pal/mcu/pal.cmake index a479d407b..907d51de6 100644 --- a/compiler/luci-interpreter/pal/mcu/pal.cmake +++ b/compiler/luci-interpreter/pal/mcu/pal.cmake @@ -39,7 +39,9 @@ macro(add_pal_to_target TGT) # TODO put it back, I changed my mind. 
# instead add sources with visitors in this library - set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc) + set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc + ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc) add_library(luci_interpreter_mcu_pal STATIC ${PAL_SOURCES}) set_target_properties(luci_interpreter_mcu_pal PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(luci_interpreter_mcu_pal PRIVATE diff --git a/compiler/luci-interpreter/src/CMakeLists.txt b/compiler/luci-interpreter/src/CMakeLists.txt index e37150336..997b75a84 100644 --- a/compiler/luci-interpreter/src/CMakeLists.txt +++ b/compiler/luci-interpreter/src/CMakeLists.txt @@ -13,6 +13,7 @@ set(LUCI_INTERPRETER_BINARY "luci_interpreter${LUCI_INTERPRETER_SUFFIX}") set(LUCI_INTERPRETER_CORE "luci_interpreter_core${LUCI_INTERPRETER_SUFFIX}") set(LUCI_INTERPRETER_KERNELS "luci_interpreter_kernels${LUCI_INTERPRETER_SUFFIX}") set(LUCI_INTERPRETER_LOADER "luci_interpreter_loader${LUCI_INTERPRETER_SUFFIX}") +set(LUCI_INTERPRETER_IMPORT "luci_interpreter_import${LUCI_INTERPRETER_SUFFIX}") add_subdirectory(core) message(STATUS "LUCI INTERPRETER CORE") @@ -20,6 +21,8 @@ add_subdirectory(kernels) message(STATUS "LUCI INTERPRETER KERNELS") add_subdirectory(loader) message(STATUS "LUCI INTERPRETER LOADER") +add_subdirectory(import) +message(STATUS "LUCI INTERPRETER IMPORT") message(STATUS "LUCI INTERPTER INITALIZED") diff --git a/compiler/luci-interpreter/src/Interpreter.cpp b/compiler/luci-interpreter/src/Interpreter.cpp index 1b8792a6c..8cf272efd 100644 --- a/compiler/luci-interpreter/src/Interpreter.cpp +++ b/compiler/luci-interpreter/src/Interpreter.cpp @@ -70,25 +70,30 @@ private: } // namespace +Interpreter::Interpreter(const luci::Module *module) +{ + _runtime_to_ir = 
std::make_unique<RuntimeToIR>(); + _event_notifier = std::make_unique<EventNotifierImpl>(*_runtime_to_ir, _observers); + _runtime_module = std::make_unique<RuntimeModule>(_event_notifier.get()); + + _default_memory_manager = std::make_unique<SimpleMemoryManager>(); + + ModuleLoader loader(module, _runtime_module.get(), *_runtime_to_ir, _node_to_tensor, + _default_memory_manager.get()); + loader.load(); +} + Interpreter::Interpreter(const luci::Module *module, luci_interpreter::IMemoryManager *memory_manager) { + assert(memory_manager && "Use Interpreter::Interpreter(module) constructor instead"); + _runtime_to_ir = std::make_unique<RuntimeToIR>(); _event_notifier = std::make_unique<EventNotifierImpl>(*_runtime_to_ir, _observers); _runtime_module = std::make_unique<RuntimeModule>(_event_notifier.get()); - if (memory_manager == nullptr) - { - _default_memory_manager = std::make_unique<SimpleMemoryManager>(); - _memory_manager = _default_memory_manager.get(); - } - else - { - _memory_manager = memory_manager; - } - ModuleLoader loader(module, _runtime_module.get(), *_runtime_to_ir, _node_to_tensor, - _memory_manager); + memory_manager); loader.load(); } diff --git a/compiler/luci-interpreter/src/core/CMakeLists.txt b/compiler/luci-interpreter/src/core/CMakeLists.txt index 4430cba11..c2471e01c 100644 --- a/compiler/luci-interpreter/src/core/CMakeLists.txt +++ b/compiler/luci-interpreter/src/core/CMakeLists.txt @@ -10,7 +10,9 @@ set(SOURCES Tensor.cpp) add_library(${LUCI_INTERPRETER_CORE} STATIC ${SOURCES}) -set_target_properties(${LUCI_INTERPRETER_CORE} PROPERTIES POSITION_INDEPENDENT_CODE ON) +if (NOT NNCC_LIBRARY_NO_PIC) + set_target_properties(${LUCI_INTERPRETER_CORE} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif(NOT NNCC_LIBRARY_NO_PIC) target_include_directories(${LUCI_INTERPRETER_CORE} PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}") target_include_directories(${LUCI_INTERPRETER_CORE} PUBLIC "${LUCI_INTERPRETER_SOURCE_DIR}") 
target_link_libraries(${LUCI_INTERPRETER_CORE} PUBLIC luci_lang) diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h index ee0390fcc..958fd4b74 100644 --- a/compiler/luci-interpreter/src/core/KernelParams.h +++ b/compiler/luci-interpreter/src/core/KernelParams.h @@ -43,6 +43,12 @@ struct ArgMaxParams DataType output_type; }; +struct BatchMatMulParams +{ + bool adj_x; + bool adj_y; +}; + struct ConcatenationParams { int axis; @@ -83,6 +89,13 @@ struct DivParams struct FullyConnectedParams { Activation activation; + bool keep_num_dims = false; +}; + +struct GatherParams +{ + int32_t axis; + int32_t batch_dims; }; struct InstanceNormParams @@ -119,6 +132,11 @@ struct MulParams Activation activation; }; +struct OneHotParams +{ + int32_t axis; +}; + struct PackParams { int32_t values_count; @@ -157,6 +175,13 @@ struct SubParams Activation activation; }; +struct SVDFParams +{ + bool asymmetric_quantize_inputs; + int32_t svdf_rank; + Activation activation; +}; + struct SpaceToDepthParams { int block_size; diff --git a/compiler/luci-interpreter/src/import/CMakeLists.txt b/compiler/luci-interpreter/src/import/CMakeLists.txt new file mode 100644 index 000000000..dd9733f92 --- /dev/null +++ b/compiler/luci-interpreter/src/import/CMakeLists.txt @@ -0,0 +1,15 @@ +set(SOURCES + "${LUCI_INTERPRETER_INCLUDE_DIR}/luci_interpreter/GraphBuilderRegistry.h" + GraphBuilderRegistry.cpp) + +# include specific builders +file(GLOB_RECURSE NODES "Nodes/*") +list(APPEND SOURCES ${NODES}) + +add_library(${LUCI_INTERPRETER_IMPORT} STATIC ${SOURCES}) +if (NOT NNCC_LIBRARY_NO_PIC) + set_target_properties(${LUCI_INTERPRETER_IMPORT} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif(NOT NNCC_LIBRARY_NO_PIC) + +target_include_directories(${LUCI_INTERPRETER_IMPORT} PUBLIC "${LUCI_INTERPRETER_INCLUDE_DIR}") +target_link_libraries(${LUCI_INTERPRETER_IMPORT} PUBLIC luci_import) diff --git 
a/compiler/luci-interpreter/src/import/GraphBuilderRegistry.cpp b/compiler/luci-interpreter/src/import/GraphBuilderRegistry.cpp new file mode 100644 index 000000000..a33bca6a4 --- /dev/null +++ b/compiler/luci-interpreter/src/import/GraphBuilderRegistry.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "luci_interpreter/GraphBuilderRegistry.h" +#include "Nodes/CircleReferencingConst.h" + +namespace luci_interpreter +{ + +std::unique_ptr<luci::GraphBuilderSource> source_without_constant_copying() +{ + auto builder = std::make_unique<luci::GraphBuilderRegistry>(); + { + // redefine NodeBuilder of BUFFER type + builder->add(std::make_unique<CircleReferencingConstNodeBuilder>()); + } + + return builder; +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/import/Nodes/CircleReferencingConst.cpp b/compiler/luci-interpreter/src/import/Nodes/CircleReferencingConst.cpp new file mode 100644 index 000000000..14e90f240 --- /dev/null +++ b/compiler/luci-interpreter/src/import/Nodes/CircleReferencingConst.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CircleReferencingConst.h" + +#include <vector> + +namespace +{ + +// helper struct which describes data loaded to custom_options of CircleReferencingConst node +struct ConstDataReference +{ + const uint8_t *data = nullptr; + uint32_t size = 0; +}; + +} // namespace + +namespace luci_interpreter +{ +using namespace luci; + +CircleNode *CircleReferencingConstNodeBuilder::build(TensorIndex tensor_index, + GraphBuilderContext *context) const +{ + assert(tensor_index >= 0); + + const auto graph = context->graph(); + const auto reader = context->reader(); + const auto tensors = reader->tensors(); + auto const const_tensor = tensors[tensor_index]; + assert(const_tensor != nullptr); + if (const_tensor->is_variable()) + { + // Create CircleVariable for variable + return nullptr; + } + + auto const buffer = wrap(reader->buffers()[const_tensor->buffer()]->data()); + auto const const_dims = wrap(const_tensor->shape()); // in NHWC + if (const_dims.empty() && buffer.empty()) + { + // unknown shape tensor and scalar tensor + return nullptr; + } + + // if tensor_index is used as output to some other operator, this is not a constant + auto tensoroutputs = context->tensoroutputs(); + if (tensoroutputs->find(tensor_index)) + { + // other operator output tensor + return nullptr; + } + + uint32_t num_elements = 1; + for (uint32_t r = 0; r < const_dims.size(); ++r) + { + num_elements = num_elements * const_dims[r]; + } + + if (buffer.empty() && num_elements > 0) + { + // normal empty tensor + return nullptr; + } + + // create CircleReferencingConst + 
auto custom_node = graph->nodes()->create<CircleCustom>(0, 1); + { + custom_node->custom_code("CircleReferencingConst"); + + copy_tensor_attributes(const_tensor, custom_node); + custom_node->shape_status(luci::ShapeStatus::VALID); + + // custom options stores size of buffer and pointer's value to buffer's data + { + std::vector<uint8_t> custom_options(sizeof(ConstDataReference)); + { + auto &const_data_ref = *reinterpret_cast<ConstDataReference *>(custom_options.data()); + const_data_ref = {buffer.data(), buffer.size()}; + } + custom_node->custom_options(custom_options); + } + } + + // Output of CircleCustom node presented with CircleConstNode + auto out_node = graph->nodes()->create<CircleCustomOut>(); + { + out_node->index(0); + out_node->input(custom_node); + + copy_tensor_attributes(const_tensor, out_node); + out_node->shape_status(luci::ShapeStatus::VALID); + } + + return out_node; +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/import/Nodes/CircleReferencingConst.h b/compiler/luci-interpreter/src/import/Nodes/CircleReferencingConst.h new file mode 100644 index 000000000..ed8f95124 --- /dev/null +++ b/compiler/luci-interpreter/src/import/Nodes/CircleReferencingConst.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __LUCI_INTERPRETER_IMPORT_OP_CIRCLE_REFERENCING_CONST_H__ +#define __LUCI_INTERPRETER_IMPORT_OP_CIRCLE_REFERENCING_CONST_H__ + +#include <luci/Import/NodeBuilder.h> + +#include <luci/IR/Nodes/CircleConst.h> + +namespace luci_interpreter +{ +using namespace luci; + +/** + * @brief Builder creates CircleCustom node with pointer to constants data from Tensor with buffer. + */ +class CircleReferencingConstNodeBuilder : public TypedNodeBuilder<NodeBuilderType::BUFFER> +{ +public: + CircleNode *build(TensorIndex tensor_index, GraphBuilderContext *ctx) const final; +}; + +} // namespace luci_interpreter + +#endif // __LUCI_INTERPRETER_IMPORT_OP_CIRCLE_REFERENCING_CONST_H__ diff --git a/compiler/luci-interpreter/src/kernels/Add.cpp b/compiler/luci-interpreter/src/kernels/Add.cpp index 7381c3849..d7bf3084f 100644 --- a/compiler/luci-interpreter/src/kernels/Add.cpp +++ b/compiler/luci-interpreter/src/kernels/Add.cpp @@ -38,8 +38,11 @@ Add::Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddPa void Add::configure() { LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type()); + LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type()); if (input1()->element_type() == DataType::S16) { + LUCI_INTERPRETER_CHECK(input1()->zero_points().size() == 1 && + input2()->zero_points().size() == 1); LUCI_INTERPRETER_CHECK(input1()->zero_point() == 0 && input2()->zero_point() == 0 && output()->zero_point() == 0); } @@ -54,6 +57,12 @@ void Add::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -67,13 +76,8 @@ void Add::execute() const void Add::evalFloat() const { - float activation_min{}; - float activation_max{}; - calculateActivationRange(_params.activation, &activation_min, &activation_max); - tflite::ArithmeticParams params{}; - 
params.float_activation_min = activation_min; - params.float_activation_max = activation_max; + fillArithmeticActivationRange<float>(params, _params.activation); const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( getTensorShape(input1()), getTensorShape(input2()), ¶ms); @@ -92,6 +96,28 @@ void Add::evalFloat() const } } +template <typename T> void Add::evalInteger() const +{ + tflite::ArithmeticParams params{}; + fillArithmeticActivationRange<T>(params, _params.activation); + + const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( + getTensorShape(input1()), getTensorShape(input2()), ¶ms); + + if (need_broadcast) + { + tflite::reference_ops::BroadcastAdd4DSlow( + params, getTensorShape(input1()), getTensorData<T>(input1()), getTensorShape(input2()), + getTensorData<T>(input2()), getTensorShape(output()), getTensorData<T>(output())); + } + else + { + tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<T>(input1()), + getTensorShape(input2()), getTensorData<T>(input2()), + getTensorShape(output()), getTensorData<T>(output())); + } +} + void Add::evalQuantized() const { const auto input1_scale = static_cast<double>(input1()->scale()); diff --git a/compiler/luci-interpreter/src/kernels/Add.h b/compiler/luci-interpreter/src/kernels/Add.h index 79518845d..91d95b6af 100644 --- a/compiler/luci-interpreter/src/kernels/Add.h +++ b/compiler/luci-interpreter/src/kernels/Add.h @@ -39,6 +39,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; void evalQuantizedS16() const; }; diff --git a/compiler/luci-interpreter/src/kernels/Add.test.cpp b/compiler/luci-interpreter/src/kernels/Add.test.cpp index 847b65667..b8b1c3089 100644 --- a/compiler/luci-interpreter/src/kernels/Add.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Add.test.cpp @@ -166,6 +166,69 @@ TEST_F(AddTest, Float) } } +template <loco::DataType DType> void 
CheckInteger(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + Shape base_shape = {2, 3, 1, 2}; + std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}}; + std::vector<std::vector<dtype>> test_outputs = { + {3, 3, 0, 1, 0, 8, 5, 1, 0, 0, 2, 6, 8, 0, 1, 0, 5, 1, + 5, 4, 0, 2, 2, 9, 11, 0, 4, 0, 8, 5, 11, 2, 4, 0, 8, 7}, + {3, 3, 0, 0, 5, 1, 5, 4, 4, 0, 8, 7}, + {3, 6, 0, 3, 0, 0, 5, 4, 2, 1, 0, 0, 8, 0, 5, 0, 1, 0, + 0, 2, 2, 4, 7, 9, 6, 0, 8, 0, 13, 5, 6, 0, 8, 2, 13, 7}, + {3, 6, 2, 1, 1, 0, 0, 2, 8, 0, 13, 7}}; + std::vector<dtype> input1_data{-1, 2, 1, 0, 4, -5, 1, 3, 7, -1, 7, 1}; + std::vector<dtype> input2_data{4, 1, -3, -1, 1, 6}; + for (size_t i = 0; i < test_shapes.size(); ++i) + { + Tensor input1_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager); + Tensor input2_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DType); + + AddParams params{}; + params.activation = Activation::RELU; + + Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i]) + << "With shape number " << i; + } + // Re-run with exchanged inputs. 
+ for (size_t i = 0; i < test_shapes.size(); ++i) + { + Tensor input1_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager); + Tensor input2_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DType); + + AddParams params{}; + params.activation = Activation::RELU; + + Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i]) + << "With shape number " << i; + } +}; + +TEST_F(AddTest, SInt32) +{ + CheckInteger<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(AddTest, SInt64) +{ + CheckInteger<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + TEST_F(AddTest, SInt16) { Shape base_shape = {2, 3, 1, 2}; @@ -248,11 +311,24 @@ TEST_F(AddTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } -TEST_F(AddTest, Invalid_Input_Type_NEG) +TEST_F(AddTest, Invalid_Output_Type_NEG) { Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get()); Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get()); - Tensor output_tensor = makeOutputTensor(DataType::S64); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + AddParams params{}; + params.activation = Activation::RELU; + + Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(AddTest, Invalid_Input_Type_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::U64>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::U64>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::U64); AddParams params{}; params.activation = Activation::RELU; @@ -263,6 +339,19 @@ TEST_F(AddTest, Invalid_Input_Type_NEG) EXPECT_ANY_THROW(kernel.execute()); 
} +TEST_F(AddTest, Invalid_Quantization_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::S16>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S16>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S16); + + AddParams params{}; + params.activation = Activation::NONE; + + Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp index 119c69ccf..474f4b321 100644 --- a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp +++ b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp @@ -57,7 +57,7 @@ template <typename T> class ArgMaxTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(ArgMaxTest, DataTypes); +TYPED_TEST_SUITE(ArgMaxTest, DataTypes); TYPED_TEST(ArgMaxTest, Simple) { diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp index 5545fb4d4..d3bade9e4 100644 --- a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp +++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp @@ -18,8 +18,7 @@ #include "kernels/Utils.h" -#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h> -#include <tensorflow/lite/kernels/internal/reference/pooling.h> +#include "PALAveragePool2d.h" #include <stdexcept> @@ -29,8 +28,9 @@ namespace luci_interpreter namespace kernels { -AveragePool2D::AveragePool2D(const Tensor *input, Tensor *output, const Pool2DParams ¶ms) - : KernelWithParams<Pool2DParams>({input}, {output}, params) +AveragePool2D::AveragePool2D(const Tensor *input, Tensor *output, Tensor *scratchpad, + const Pool2DParams ¶ms) + : KernelWithParams<Pool2DParams>({input}, {output, scratchpad}, params) { } 
@@ -76,6 +76,10 @@ void AveragePool2D::configure() LUCI_INTERPRETER_CHECK(output()->zero_point() == input()->zero_point()); } output()->resize({batches, output_height, output_width, depth}); + + auto scratchpad = getOutputTensors()[1]; + luci_interpreter_pal::SetupScratchpadTensor(scratchpad, input()->element_type(), + getTensorShape(input()), getTensorShape(output())); } void AveragePool2D::execute() const @@ -155,9 +159,14 @@ void AveragePool2D::evalSInt8() const params.quantized_activation_min = activation_min; params.quantized_activation_max = activation_max; - tflite::reference_integer_ops::AveragePool( + auto scratchpad = getOutputTensors()[1]; + int8_t *scratchpad_data = nullptr; + if (scratchpad->is_allocatable()) + scratchpad_data = scratchpad->data<int8_t>(); + + luci_interpreter_pal::AveragePool<int8_t>( params, getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(output()), - getTensorData<int8_t>(output())); + getTensorData<int8_t>(output()), getTensorShape(scratchpad), scratchpad_data); } void AveragePool2D::evalSInt16() const diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.h b/compiler/luci-interpreter/src/kernels/AveragePool2D.h index b98367f31..2c8fe16e7 100644 --- a/compiler/luci-interpreter/src/kernels/AveragePool2D.h +++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.h @@ -28,7 +28,8 @@ namespace kernels class AveragePool2D : public KernelWithParams<Pool2DParams> { public: - AveragePool2D(const Tensor *input, Tensor *output, const Pool2DParams ¶ms); + AveragePool2D(const Tensor *input, Tensor *output, Tensor *scratchpad, + const Pool2DParams ¶ms); const Tensor *input() const { return _inputs[0]; } Tensor *output() const { return _outputs[0]; } diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp index 7ed421129..478bfa68e 100644 --- a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp +++ 
b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp @@ -46,6 +46,7 @@ TEST_F(AveragePool2DTest, Float) Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -55,8 +56,9 @@ TEST_F(AveragePool2DTest, Float) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); kernel.configure(); + _memory_manager->allocate_memory(scratchpad); _memory_manager->allocate_memory(output_tensor); kernel.execute(); @@ -78,6 +80,7 @@ TEST_F(AveragePool2DTest, Uint8_0) Tensor input_tensor = makeInputTensor<DataType::U8>( {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second); + Tensor scratchpad(DataType::U8, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -87,8 +90,9 @@ TEST_F(AveragePool2DTest, Uint8_0) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); kernel.configure(); + _memory_manager->allocate_memory(scratchpad); _memory_manager->allocate_memory(output_tensor); kernel.execute(); @@ -107,6 +111,7 @@ TEST_F(AveragePool2DTest, Uint8_1) Tensor input_tensor = makeInputTensor<DataType::U8>( {1, 2, 4, 1}, quant_param.first, quant_param.second, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second); + Tensor scratchpad(DataType::U8, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -116,9 +121,10 @@ 
TEST_F(AveragePool2DTest, Uint8_1) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); kernel.configure(); _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad); kernel.execute(); EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear({2.75, 6.0})); @@ -141,6 +147,7 @@ TEST_F(AveragePool2DTest, SInt16) Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.5, 0, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0); + Tensor scratchpad(DataType::S16, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -150,8 +157,9 @@ TEST_F(AveragePool2DTest, SInt16) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); kernel.configure(); + _memory_manager->allocate_memory(scratchpad); _memory_manager->allocate_memory(output_tensor); kernel.execute(); @@ -174,6 +182,7 @@ TEST_F(AveragePool2DTest, SInt8) Tensor input_tensor = makeInputTensor<DataType::S8>( input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::S8, quant_param.first, quant_param.second); + Tensor scratchpad(DataType::S8, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -183,8 +192,9 @@ TEST_F(AveragePool2DTest, SInt8) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); kernel.configure(); + _memory_manager->allocate_memory(scratchpad); _memory_manager->allocate_memory(output_tensor); kernel.execute(); @@ -203,6 +213,7 @@ 
TEST_F(AveragePool2DTest, Invalid_Input_Shape_NEG) Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -212,7 +223,7 @@ TEST_F(AveragePool2DTest, Invalid_Input_Shape_NEG) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); EXPECT_ANY_THROW(kernel.configure()); } @@ -227,6 +238,7 @@ TEST_F(AveragePool2DTest, In_Out_Type_NEG) Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::U8); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -236,7 +248,7 @@ TEST_F(AveragePool2DTest, In_Out_Type_NEG) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); EXPECT_ANY_THROW(kernel.configure()); } @@ -252,6 +264,7 @@ TEST_F(AveragePool2DTest, Quant_Param_NEG) Tensor input_tensor = makeInputTensor<DataType::U8>( {1, 2, 4, 1}, quant_param1.first, quant_param1.second, input_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param2.first, quant_param2.second); + Tensor scratchpad(DataType::U8, Shape({}), {}, ""); Pool2DParams params{}; params.padding = Padding::VALID; @@ -261,7 +274,7 @@ TEST_F(AveragePool2DTest, Quant_Param_NEG) params.stride_width = 2; params.activation = Activation::RELU6; - AveragePool2D kernel(&input_tensor, &output_tensor, params); + AveragePool2D kernel(&input_tensor, &output_tensor, &scratchpad, params); 
EXPECT_ANY_THROW(kernel.configure()); } diff --git a/compiler/luci-interpreter/src/kernels/BatchMatMul.cpp b/compiler/luci-interpreter/src/kernels/BatchMatMul.cpp new file mode 100644 index 000000000..24ca22996 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/BatchMatMul.cpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/BatchMatMul.h" +#include "kernels/Utils.h" + +#include "PALBatchMatMul.h" + +#include <tensorflow/lite/kernels/internal/reference/transpose.h> + +#include <stdexcept> + +namespace +{ + +tflite::RuntimeShape SwapRowColumnDims(const tflite::RuntimeShape &shape) +{ + tflite::RuntimeShape swapped_shape(shape); + const int32_t dims = shape.DimensionsCount(); + swapped_shape.SetDim(dims - 2, shape.Dims(dims - 1)); + swapped_shape.SetDim(dims - 1, shape.Dims(dims - 2)); + return swapped_shape; +} + +} // namespace + +namespace luci_interpreter +{ +namespace kernels +{ + +BatchMatMul::BatchMatMul(const Tensor *x, const Tensor *y, Tensor *output, Tensor *x_tmp, + Tensor *y_tmp, const BatchMatMulParams ¶ms) + : KernelWithParams({x, y}, {output, x_tmp, y_tmp}, params) +{ +} + +void BatchMatMul::configure() +{ + auto lhs = x(); + auto rhs = y(); + auto adj_x = params().adj_x; + auto adj_y = params().adj_y; + + // TODO Support non-float types + if (lhs->element_type() != DataType::FLOAT32 || rhs->element_type() != DataType::FLOAT32) + throw std::runtime_error("Unsupported type."); + + LUCI_INTERPRETER_CHECK(lhs->element_type() == rhs->element_type()); + + auto lhs_rank = lhs->shape().num_dims(); + auto rhs_rank = rhs->shape().num_dims(); + LUCI_INTERPRETER_CHECK(lhs_rank >= 2 && lhs_rank <= 4); + LUCI_INTERPRETER_CHECK(rhs_rank >= 2 && rhs_rank <= 4); + + auto lhs_scratchpad = temp_lhs(); + auto rhs_scratchpad = temp_rhs(); + luci_interpreter_pal::SetupScratchpadTensor(lhs_scratchpad, rhs_scratchpad, getTensorShape(lhs), + getTensorShape(rhs)); + + auto output_rank = std::max(lhs_rank, rhs_rank); + + auto extended_lhs_shape = tflite::RuntimeShape::ExtendedShape(output_rank, getTensorShape(lhs)); + auto extended_rhs_shape = tflite::RuntimeShape::ExtendedShape(output_rank, getTensorShape(rhs)); + + // Ensure any batch dimensions obey broacasting rules. 
+ for (int i = 0; i < output_rank - 2; ++i) + { + const int lhs_dim = extended_lhs_shape.Dims(i); + const int rhs_dim = extended_rhs_shape.Dims(i); + if (lhs_dim != rhs_dim) + { + if (lhs_dim != 1) + { + LUCI_INTERPRETER_CHECK(rhs_dim == 1); + } + } + } + + // Ensure other dimensions work for matrix multiplication. + int accum_dim_lhs = + adj_x ? extended_lhs_shape.Dims(output_rank - 2) : extended_lhs_shape.Dims(output_rank - 1); + int accum_dim_rhs = + adj_y ? extended_rhs_shape.Dims(output_rank - 1) : extended_rhs_shape.Dims(output_rank - 2); + LUCI_INTERPRETER_CHECK(accum_dim_lhs == accum_dim_rhs); + + Shape output_shape(output_rank); + // Fill in any broadcast dimensions. + for (int i = 0; i < output_rank - 2; ++i) + { + const int lhs_dim = extended_lhs_shape.Dims(i); + const int rhs_dim = extended_rhs_shape.Dims(i); + int broadcast_dim = lhs_dim; + if ((lhs_dim != rhs_dim) && (lhs_dim == 1)) + { + broadcast_dim = rhs_dim; + } + output_shape.dim(i) = broadcast_dim; + } + // Fill in the matmul dimensions. + int lhs_rows_index = adj_x ? output_rank - 1 : output_rank - 2; + int rhs_cols_index = adj_y ? output_rank - 2 : output_rank - 1; + + output_shape.dim(output_rank - 2) = extended_lhs_shape.Dims(lhs_rows_index); + output_shape.dim(output_rank - 1) = extended_rhs_shape.Dims(rhs_cols_index); + + output()->resize(output_shape); +} + +void TransposeRowsColumns(const Tensor *tensor_in, Tensor *tensor_out) +{ + tflite::RuntimeShape transposed_shape(getTensorShape(tensor_in)); + tflite::RuntimeShape shape(getTensorShape(tensor_in)); + tflite::TransposeParams params; + int rank = shape.DimensionsCount(); + params.perm_count = rank; + for (int i = 0; i < rank - 2; ++i) + { + params.perm[i] = i; + } + // Transpose the last two dimensions. 
+ params.perm[rank - 2] = rank - 1; + params.perm[rank - 1] = rank - 2; + transposed_shape.SetDim(rank - 1, shape.Dims(rank - 2)); + transposed_shape.SetDim(rank - 2, shape.Dims(rank - 1)); + switch (tensor_in->element_type()) + { + case DataType::FLOAT32: + tflite::reference_ops::Transpose(params, shape, getTensorData<float>(tensor_in), + transposed_shape, getTensorData<float>(tensor_out)); + break; + default: + throw std::runtime_error("Only suppport fp32 BatchMatMul for now."); + } +} + +void BatchMatMul::execute() const +{ + auto lhs = x(); + auto rhs = y(); + + bool adj_x = params().adj_x; + bool adj_y = params().adj_y; + + auto orig_lhs_shape = getTensorShape(lhs); + auto orig_rhs_shape = getTensorShape(rhs); + + auto rhs_tensor = adj_y ? rhs : temp_rhs(); + auto lhs_tensor = adj_x ? temp_lhs() : lhs; + if (not adj_y) + { + TransposeRowsColumns(rhs, temp_rhs()); + } + if (adj_x) + { + TransposeRowsColumns(lhs, temp_lhs()); + } + tflite::RuntimeShape rhs_shape = adj_y ? orig_rhs_shape : SwapRowColumnDims(orig_rhs_shape); + tflite::RuntimeShape lhs_shape = adj_x ? orig_lhs_shape : SwapRowColumnDims(orig_lhs_shape); + + switch (x()->element_type()) + { + case DataType::FLOAT32: + luci_interpreter_pal::BatchMatMul(rhs_shape, getTensorData<float>(rhs_tensor), lhs_shape, + getTensorData<float>(lhs_tensor), getTensorShape(output()), + getTensorData<float>(output())); + break; + default: + throw std::runtime_error("Unsupported type."); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/BatchMatMul.h b/compiler/luci-interpreter/src/kernels/BatchMatMul.h new file mode 100644 index 000000000..744f49795 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/BatchMatMul.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_BATCHMATMUL_H +#define LUCI_INTERPRETER_KERNELS_BATCHMATMUL_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class BatchMatMul : public KernelWithParams<BatchMatMulParams> +{ +public: + BatchMatMul(const Tensor *x, const Tensor *y, Tensor *output, Tensor *x_tmp, Tensor *y_tmp, + const BatchMatMulParams ¶ms); + + const Tensor *x() const { return _inputs[0]; } + const Tensor *y() const { return _inputs[1]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; + +private: + Tensor *temp_lhs() const { return _outputs[1]; } + Tensor *temp_rhs() const { return _outputs[2]; } +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_BATCHMATMUL_H diff --git a/compiler/luci-interpreter/src/kernels/BatchMatMul.test.cpp b/compiler/luci-interpreter/src/kernels/BatchMatMul.test.cpp new file mode 100644 index 000000000..edfa3a685 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/BatchMatMul.test.cpp @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/BatchMatMul.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class BatchMatMulTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } + + std::unique_ptr<IMemoryManager> _memory_manager; +}; + +TEST_F(BatchMatMulTest, Float) +{ + std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6}; + std::vector<float> rhs_data = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 2, 3}, lhs_data, _memory_manager.get()); + Tensor rhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 3, 4}, rhs_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + kernel.configure(); + _memory_manager->allocate_memory(lhs_scratch); + _memory_manager->allocate_memory(rhs_scratch); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218.})); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4})); +} + 
+TEST_F(BatchMatMulTest, Float_SimpleRHSAdjoint) +{ + std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6}; + std::vector<float> rhs_data = {7, 11, 15, 8, 12, 16, 9, 13, 17, 10, 14, 18}; + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 2, 3}, lhs_data, _memory_manager.get()); + Tensor rhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 4, 3}, rhs_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = true; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + kernel.configure(); + _memory_manager->allocate_memory(lhs_scratch); + _memory_manager->allocate_memory(rhs_scratch); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218.})); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4})); +} + +TEST_F(BatchMatMulTest, Float_SimpleLHSAdjoint) +{ + std::vector<float> lhs_data = {1, 4, 2, 5, 3, 6}; + std::vector<float> rhs_data = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 3, 2}, lhs_data, _memory_manager.get()); + Tensor rhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 3, 4}, rhs_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = true; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + kernel.configure(); + _memory_manager->allocate_memory(lhs_scratch); + 
_memory_manager->allocate_memory(rhs_scratch); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218.})); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 4})); +} + +TEST_F(BatchMatMulTest, Float_BatchSizeTwo) +{ + std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + std::vector<float> rhs_data = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}; + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({2, 2, 3}, lhs_data, _memory_manager.get()); + Tensor rhs_tensor = + makeInputTensor<DataType::FLOAT32>({2, 3, 4}, rhs_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + kernel.configure(); + _memory_manager->allocate_memory(lhs_scratch); + _memory_manager->allocate_memory(rhs_scratch); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + FloatArrayNear({74., 80., 86., 92., 173., 188., 203., 218., 560., 584., 608., 632., + 767., 800., 833., 866.})); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 2, 4})); +} + +TEST_F(BatchMatMulTest, Float_DiffBatch) +{ + std::vector<float> lhs_data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + std::vector<float> rhs_data = {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}; + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({2, 1, 6}, lhs_data, _memory_manager.get()); + Tensor rhs_tensor = + 
makeInputTensor<DataType::FLOAT32>({1, 6, 4}, rhs_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + kernel.configure(); + _memory_manager->allocate_memory(lhs_scratch); + _memory_manager->allocate_memory(rhs_scratch); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + FloatArrayNear({427., 448., 469., 490., 1039., 1096., 1153., 1210.})); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 1, 4})); +} + +TEST_F(BatchMatMulTest, Invalid_Shape_NEG) +{ + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 2, 2}, {1, 2, 3, 4}, _memory_manager.get()); + Tensor rhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 3, 2}, {5, 6, 7, 8, 9, 10}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(BatchMatMulTest, Invalid_Batch_NEG) +{ + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({2, 1, 3}, {1, 2, 3, 4, 5, 6}, _memory_manager.get()); + Tensor rhs_tensor = makeInputTensor<DataType::FLOAT32>({3, 3, 1}, {5, 6, 7, 8, 9, 10, 11, 12, 13}, + _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, 
""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(BatchMatMulTest, Invalid_Rank_NEG) +{ + Tensor lhs_tensor = makeInputTensor<DataType::FLOAT32>({4}, {1, 2, 3, 4}, _memory_manager.get()); + Tensor rhs_tensor = makeInputTensor<DataType::FLOAT32>({1, 4, 2}, {5, 6, 7, 8, 9, 10, 11, 12}, + _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(BatchMatMulTest, Invalid_Rank2_NEG) +{ + Tensor lhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 1, 1, 1, 4}, {1, 2, 3, 4}, _memory_manager.get()); + Tensor rhs_tensor = makeInputTensor<DataType::FLOAT32>({1, 4, 2}, {5, 6, 7, 8, 9, 10, 11, 12}, + _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + Tensor rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(BatchMatMulTest, TypeMisMatch_NEG) +{ + Tensor lhs_tensor = + makeInputTensor<DataType::U8>({1, 2, 3}, {1, 2, 3, 4, 5, 6}, _memory_manager.get()); + Tensor rhs_tensor = + makeInputTensor<DataType::FLOAT32>({1, 3, 2}, {5, 6, 7, 8, 9, 10}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor lhs_scratch(DataType::U8, Shape({}), {}, ""); + Tensor 
rhs_scratch(DataType::FLOAT32, Shape({}), {}, ""); + + BatchMatMulParams params; + params.adj_x = false; + params.adj_y = false; + + BatchMatMul kernel(&lhs_tensor, &rhs_tensor, &output_tensor, &lhs_scratch, &rhs_scratch, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp index f3a344974..52647a763 100644 --- a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp +++ b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp @@ -58,7 +58,7 @@ template <typename T> class BatchToSpaceNDTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(BatchToSpaceNDTest, DataTypes); +TYPED_TEST_SUITE(BatchToSpaceNDTest, DataTypes); TYPED_TEST(BatchToSpaceNDTest, Simple) { diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt index 1b7d0f66a..9f4ba0e0b 100644 --- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt +++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt @@ -15,7 +15,9 @@ endmacro(REGISTER_KERNEL) include(${KERNEL_REGISTER_FILE}) add_library(${LUCI_INTERPRETER_KERNELS} STATIC ${SOURCES}) -set_target_properties(${LUCI_INTERPRETER_KERNELS} PROPERTIES POSITION_INDEPENDENT_CODE ON) +if (NOT NNCC_LIBRARY_NO_PIC) + set_target_properties(${LUCI_INTERPRETER_KERNELS} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif(NOT NNCC_LIBRARY_NO_PIC) target_include_directories(${LUCI_INTERPRETER_KERNELS} PUBLIC ${LUCI_INTERPRETER_SOURCE_DIR}) target_link_libraries(${LUCI_INTERPRETER_KERNELS} PUBLIC ${LUCI_INTERPRETER_CORE}) diff --git a/compiler/luci-interpreter/src/kernels/Cast.test.cpp b/compiler/luci-interpreter/src/kernels/Cast.test.cpp index 731260522..4713ad34c 100644 --- a/compiler/luci-interpreter/src/kernels/Cast.test.cpp +++ 
b/compiler/luci-interpreter/src/kernels/Cast.test.cpp @@ -79,7 +79,7 @@ template <typename T> class CastTest : public ::testing::Test using IntDataTypes = ::testing::Types<uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t>; -TYPED_TEST_CASE(CastTest, IntDataTypes); +TYPED_TEST_SUITE(CastTest, IntDataTypes); TYPED_TEST(CastTest, FloatToInt) { diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.cpp index 7cfdf34b9..46ee5941e 100644 --- a/compiler/luci-interpreter/src/kernels/Concatenation.cpp +++ b/compiler/luci-interpreter/src/kernels/Concatenation.cpp @@ -69,11 +69,21 @@ void Concatenation::configure() Shape output_shape = t0->shape(); output_shape.dim(axis) = sum_axis; - // TODO S8 type needs more checking: quantization parameters of all input tensors and the output - // tensor should be the same. Note that there is no such requirement for U8 type. - if (t0->element_type() == DataType::S8) - throw std::runtime_error("Unsupported type."); + // If input tensors are INT8 type then quantization parameters of all input tensors and the output + // should be the same + for (auto current_tensor : _inputs) + { + if (current_tensor->element_type() == DataType::S8) + { + LUCI_INTERPRETER_CHECK(current_tensor->quantized_dimension() == + output()->quantized_dimension()); + LUCI_INTERPRETER_CHECK(current_tensor->zero_points().size() == + current_tensor->scales().size()); + LUCI_INTERPRETER_CHECK(current_tensor->zero_points() == output()->zero_points()); + LUCI_INTERPRETER_CHECK(current_tensor->scales() == output()->scales()); + } + } output()->resize(output_shape); } diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp index e4b50611a..f893b38fd 100644 --- a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp @@ -183,12 +183,12 
@@ TEST_F(ConcatenationTest, Mismatching_Input_Dimension_NEG) EXPECT_ANY_THROW(kernel.configure()); } -TEST_F(ConcatenationTest, Unsupported_Configure_Type_NEG) +TEST_F(ConcatenationTest, Int8_Mismatching_Input_Type_NEG) { - std::vector<int8_t> input1_data{1, 2, 3, 4, 5, 6}; - std::vector<int8_t> input2_data{7, 8, 9, 10, 11, 12}; - Tensor input1_tensor = makeInputTensor<DataType::S8>({2, 3}, input1_data, _memory_manager.get()); - Tensor input2_tensor = makeInputTensor<DataType::S8>({2, 3}, input2_data, _memory_manager.get()); + std::vector<uint8_t> input1_data{1, 2, 3, 4}; + std::vector<int8_t> input2_data{5, 6, 7, 8}; + Tensor input1_tensor = makeInputTensor<DataType::U8>({2, 2}, input1_data, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S8>({2, 2}, input2_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::S8); ConcatenationParams params{}; @@ -199,6 +199,51 @@ TEST_F(ConcatenationTest, Unsupported_Configure_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } +TEST_F(ConcatenationTest, Int8_Mismatching_Input_Output_Quant_Params_NEG) +{ + std::vector<float> input1_data{1, 2, 3, 4, 5, 6}; + std::vector<float> input2_data{7, 8, 9, 10, 11, 12}; + int quantized_dimension = 3; + std::vector<float> scales{0.1, 0.2, 0.3}; + std::vector<int32_t> zero_points{1, -1, 1}; + + Tensor input1_tensor = makeInputTensor<DataType::S8>( + {1, 1, 2, 3}, scales, zero_points, quantized_dimension, input1_data, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S8>( + {1, 1, 2, 3}, scales, zero_points, quantized_dimension, input2_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S8, scales.at(0), zero_points.at(0)); + ConcatenationParams params{}; + + params.axis = -1; + params.activation = luci::FusedActFunc::NONE; + + Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(ConcatenationTest, 
Int8_Mismatching_Zero_Point_NEG) +{ + std::vector<float> input1_data{1, 2, 3, 4}; + std::vector<float> input2_data{5, 6, 7, 8}; + float scale = 0.1; + int32_t zero_point_1 = 1; + int32_t zero_point_2 = -1; + + Tensor input1_tensor = + makeInputTensor<DataType::S8>({2, 2}, scale, zero_point_1, input1_data, _memory_manager.get()); + Tensor input2_tensor = + makeInputTensor<DataType::S8>({2, 2}, scale, zero_point_2, input2_data, _memory_manager.get()); + + Tensor output_tensor = makeOutputTensor(DataType::S8, scale, zero_point_1); + ConcatenationParams params{}; + + params.axis = -1; + params.activation = luci::FusedActFunc::NONE; + + Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + // TODO: Remove this test when concat w/ fused_activation is supported TEST_F(ConcatenationTest, With_Fused_Activation_NEG) { diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.cpp index 5647f4c44..234f95425 100644 --- a/compiler/luci-interpreter/src/kernels/Conv2D.cpp +++ b/compiler/luci-interpreter/src/kernels/Conv2D.cpp @@ -30,8 +30,8 @@ namespace kernels { Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, - Tensor *im2col, const Conv2DParams ¶ms) - : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, im2col}, params) + Tensor *scratchpad, const Conv2DParams ¶ms) + : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, scratchpad}, params) { } @@ -108,27 +108,18 @@ void Conv2D::configure() output()->resize({batches, output_height, output_width, output_depth}); - // Allocate tensor for Im2Col, if needed. - // The checks here should be aligned with the actual implementation. 
- const bool need_dilated_im2col = - _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1; - const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 || - filter_height != 1 || filter_width != 1; - _need_im2col = - input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col); - if (_need_im2col) - { - const int input_depth = input_shape.dim(3); - Shape im2col_shape{batches, output_height, output_width, - input_depth * filter_height * filter_width}; - auto im2col = getOutputTensors()[1]; - im2col->resize(im2col_shape); - } - else - { - auto im2col = getOutputTensors()[1]; - im2col->set_allocatable(false); - } + // Allocate tensor for scratchpad, if needed. + tflite::ConvParams params{}; + params.padding_values.height = _padding_height; + params.padding_values.width = _padding_width; + params.stride_height = _params.stride_height; + params.stride_width = _params.stride_width; + params.dilation_height_factor = _params.dilation_height_factor; + params.dilation_width_factor = _params.dilation_width_factor; + auto scratchpad = getOutputTensors()[1]; + luci_interpreter_pal::SetupScratchpadTensor(scratchpad, input()->element_type(), params, + getTensorShape(input()), getTensorShape(filter()), + getTensorShape(output())); switch (_params.activation) { @@ -193,16 +184,16 @@ void Conv2D::evalFloat() const params.float_activation_min = activation_min; params.float_activation_max = activation_max; - float *im2col_data = nullptr; - auto im2col = getOutputTensors()[1]; - if (_need_im2col) - { - im2col_data = im2col->data<float>(); - } - luci_interpreter_pal::Conv( - params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()), - getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()), - getTensorShape(output()), getTensorData<float>(output()), getTensorShape(im2col), im2col_data); + auto scratchpad = getOutputTensors()[1]; + float 
*scratchpad_data = nullptr; + if (scratchpad->is_allocatable()) + scratchpad_data = scratchpad->data<float>(); + + luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<float>(input()), + getTensorShape(filter()), getTensorData<float>(filter()), + getTensorShape(bias()), getTensorData<float>(bias()), + getTensorShape(output()), getTensorData<float>(output()), + getTensorShape(scratchpad), scratchpad_data); } void Conv2D::evalQuantized() const @@ -236,12 +227,12 @@ void Conv2D::evalQuantized() const params.quantized_activation_min = activation_min; params.quantized_activation_max = activation_max; - auto im2col = getOutputTensors()[1]; + auto scratchpad = getOutputTensors()[1]; luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()), getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()), getTensorData<uint8_t>(output()), - getTensorShape(im2col), getTensorData<uint8_t>(im2col)); + getTensorShape(scratchpad), getTensorData<uint8_t>(scratchpad)); } void Conv2D::evalQuantizedPerChannel() const @@ -364,18 +355,16 @@ void Conv2D::evalQuantizedS8PerChannel() const std::back_inserter(multipliers), [](ChannelQuantMultipliers cm) { return cm.multiplier; }); - int8_t *im2col_data = nullptr; - auto im2col = getOutputTensors()[1]; - if (_need_im2col) - { - im2col_data = im2col->data<int8_t>(); - } + auto scratchpad = getOutputTensors()[1]; + int8_t *scratchpad_data = nullptr; + if (scratchpad->is_allocatable()) + scratchpad_data = scratchpad->data<int8_t>(); luci_interpreter_pal::ConvPerChannel( params, multipliers.data(), shifts.data(), getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()), - getTensorData<int8_t>(output()), getTensorShape(im2col), im2col_data); + getTensorData<int8_t>(output()), 
getTensorShape(scratchpad), scratchpad_data); } void Conv2D::evalQuantizedS16() const diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.h b/compiler/luci-interpreter/src/kernels/Conv2D.h index 5f1317638..330bf3a2a 100644 --- a/compiler/luci-interpreter/src/kernels/Conv2D.h +++ b/compiler/luci-interpreter/src/kernels/Conv2D.h @@ -31,7 +31,7 @@ class Conv2D : public KernelWithParams<Conv2DParams> { public: Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, - Tensor *im2col, const Conv2DParams ¶ms); + Tensor *scratchpad, const Conv2DParams ¶ms); const Tensor *input() const { return _inputs[0]; } const Tensor *filter() const { return _inputs[1]; } @@ -49,7 +49,6 @@ private: void evalQuantizedS16() const; private: - bool _need_im2col = false; int32_t _padding_height{}; int32_t _padding_width{}; }; diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp index 9b1c09ba9..88e6e07f1 100644 --- a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp +++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp @@ -32,7 +32,7 @@ template <typename T> class DepthToSpaceTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(DepthToSpaceTest, DataTypes); +TYPED_TEST_SUITE(DepthToSpaceTest, DataTypes); TYPED_TEST(DepthToSpaceTest, SimpleCase) { diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp index f2dbf6c68..c554c309d 100644 --- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp +++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp @@ -18,9 +18,7 @@ #include "kernels/Utils.h" -#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h> -#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h> -#include 
<tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h> +#include "PALDepthwiseConv2d.h" #include <stdexcept> @@ -30,8 +28,9 @@ namespace kernels { DepthwiseConv2D::DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, - Tensor *output, const DepthwiseConv2DParams ¶ms) - : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output}, params) + Tensor *output, Tensor *scratchpad, + const DepthwiseConv2DParams ¶ms) + : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output, scratchpad}, params) { } @@ -109,6 +108,16 @@ void DepthwiseConv2D::configure() filter_width, output_width); output()->resize({batches, output_height, output_width, channels_out}); + + tflite::DepthwiseParams params{}; + + params.dilation_height_factor = _params.dilation_height_factor; + params.dilation_width_factor = _params.dilation_width_factor; + + auto scratchpad = getOutputTensors()[1]; + luci_interpreter_pal::SetupScratchpadTensor(scratchpad, params, input()->element_type(), + getTensorShape(input()), getTensorShape(filter()), + getTensorShape(output())); } void DepthwiseConv2D::execute() const @@ -337,11 +346,16 @@ void DepthwiseConv2D::evalQuantizedS8PerChannel() const std::back_inserter(multipliers), [](ChannelQuantMultipliers cm) { return cm.multiplier; }); - tflite::reference_integer_ops::DepthwiseConvPerChannel( + auto scratchpad = getOutputTensors()[1]; + int8_t *scratchpad_data = nullptr; + if (scratchpad->is_allocatable()) + scratchpad_data = scratchpad->data<int8_t>(); + + luci_interpreter_pal::DepthwiseConvPerChannel<int8_t>( params, multipliers.data(), shifts.data(), getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()), - getTensorData<int8_t>(output())); + getTensorData<int8_t>(output()), getTensorShape(scratchpad), scratchpad_data); } void 
DepthwiseConv2D::evalQuantizedS16() const diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h index 6cffd6583..3d1faf6c1 100644 --- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h +++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.h @@ -29,7 +29,7 @@ class DepthwiseConv2D : public KernelWithParams<DepthwiseConv2DParams> { public: DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, - const DepthwiseConv2DParams ¶ms); + Tensor *scratchpad, const DepthwiseConv2DParams ¶ms); const Tensor *input() const { return _inputs[0]; } const Tensor *filter() const { return _inputs[1]; } diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp index 74975899a..6b4673f3e 100644 --- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp +++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp @@ -59,6 +59,7 @@ TEST_F(DepthwiseConv2DTest, Float) makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get()); Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get()); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); DepthwiseConv2DParams params{}; @@ -70,8 +71,10 @@ TEST_F(DepthwiseConv2DTest, Float) params.dilation_width_factor = 1; params.activation = Activation::RELU; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); kernel.configure(); + _memory_manager->allocate_memory(scratchpad); _memory_manager->allocate_memory(output_tensor); kernel.execute(); @@ -111,6 +114,7 @@ TEST_F(DepthwiseConv2DTest, Uint8) {4}, input_quant_param.first * input_quant_param.first, 
0, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -121,9 +125,11 @@ TEST_F(DepthwiseConv2DTest, Uint8) params.dilation_width_factor = 1; params.activation = Activation::NONE; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); kernel.configure(); _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad); kernel.execute(); std::vector<float> ref_output_data{ @@ -166,6 +172,7 @@ TEST_F(DepthwiseConv2DTest, SInt16) Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, 0.25 * 0.2, 0, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0); + Tensor scratchpad(DataType::S64, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -176,9 +183,11 @@ TEST_F(DepthwiseConv2DTest, SInt16) params.dilation_width_factor = 1; params.activation = Activation::RELU; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); kernel.configure(); _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad); kernel.execute(); EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); @@ -224,6 +233,7 @@ TEST_F(DepthwiseConv2DTest, SInt16_CWQ_weights) Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0); + Tensor scratchpad(DataType::S16, Shape({}), {}, 
""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -234,9 +244,11 @@ TEST_F(DepthwiseConv2DTest, SInt16_CWQ_weights) params.dilation_width_factor = 1; params.activation = Activation::RELU; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); kernel.configure(); _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad); kernel.execute(); EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); @@ -299,6 +311,7 @@ TEST_F(DepthwiseConv2DTest, Uint8_CWQ_weights) _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second); + Tensor scratchpad(DataType::U8, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -309,9 +322,11 @@ TEST_F(DepthwiseConv2DTest, Uint8_CWQ_weights) params.dilation_width_factor = 1; params.activation = Activation::NONE; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); kernel.configure(); _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad); kernel.execute(); EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); @@ -375,6 +390,7 @@ TEST_F(DepthwiseConv2DTest, SInt8_CWQ_weights) _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::S8, output_quant_param.first, output_quant_param.second); + Tensor scratchpad(DataType::S8, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -385,9 +401,11 @@ TEST_F(DepthwiseConv2DTest, SInt8_CWQ_weights) params.dilation_width_factor = 1; params.activation = Activation::NONE; - 
DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); kernel.configure(); _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad); kernel.execute(); EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); @@ -419,6 +437,7 @@ TEST_F(DepthwiseConv2DTest, InvalidBiasType_NEG) makeInputTensor<DataType::FLOAT32>(filter_shape, filter_data, _memory_manager.get()); Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -429,7 +448,8 @@ TEST_F(DepthwiseConv2DTest, InvalidBiasType_NEG) params.dilation_width_factor = 1; params.activation = Activation::RELU; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); EXPECT_ANY_THROW(kernel.configure()); } @@ -458,6 +478,7 @@ TEST_F(DepthwiseConv2DTest, InOutTypeMismatch_NEG) Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::U8); + Tensor scratchpad(DataType::U8, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -468,7 +489,8 @@ TEST_F(DepthwiseConv2DTest, InOutTypeMismatch_NEG) params.dilation_width_factor = 1; params.activation = Activation::RELU; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); EXPECT_ANY_THROW(kernel.configure()); } @@ 
-497,6 +519,7 @@ TEST_F(DepthwiseConv2DTest, InvalidInputShape_NEG) Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -507,7 +530,8 @@ TEST_F(DepthwiseConv2DTest, InvalidInputShape_NEG) params.dilation_width_factor = 1; params.activation = Activation::RELU; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); EXPECT_ANY_THROW(kernel.configure()); } @@ -536,6 +560,7 @@ TEST_F(DepthwiseConv2DTest, InvalidFilterShape_NEG) Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -546,7 +571,8 @@ TEST_F(DepthwiseConv2DTest, InvalidFilterShape_NEG) params.dilation_width_factor = 1; params.activation = Activation::RELU; - DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); EXPECT_ANY_THROW(kernel.configure()); } @@ -575,6 +601,7 @@ TEST_F(DepthwiseConv2DTest, InvalidBiasDim_NEG) Tensor bias_tensor = makeInputTensor<DataType::FLOAT32>(bias_shape, bias_data, _memory_manager.get()); Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + Tensor scratchpad(DataType::FLOAT32, Shape({}), {}, ""); DepthwiseConv2DParams params{}; params.padding = Padding::VALID; @@ -585,7 +612,8 @@ TEST_F(DepthwiseConv2DTest, InvalidBiasDim_NEG) params.dilation_width_factor = 1; params.activation = Activation::RELU; - 
DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, params); + DepthwiseConv2D kernel(&input_tensor, &filter_tensor, &bias_tensor, &output_tensor, &scratchpad, + params); EXPECT_ANY_THROW(kernel.configure()); } diff --git a/compiler/luci-interpreter/src/kernels/Dequantize.cpp b/compiler/luci-interpreter/src/kernels/Dequantize.cpp new file mode 100644 index 000000000..96399e5c7 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Dequantize.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Dequantize.h" +#include "kernels/Utils.h" +#include "PALDequantize.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +Dequantize::Dequantize(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} + +void Dequantize::configure() +{ + LUCI_INTERPRETER_CHECK(input()->element_type() == loco::DataType::S8 || + input()->element_type() == loco::DataType::U8 || + input()->element_type() == loco::DataType::S16); + + LUCI_INTERPRETER_CHECK(input()->scales().size() == 1); + + if (input()->element_type() == loco::DataType::S16) + LUCI_INTERPRETER_CHECK(input()->zero_point() == 0); + + LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::FLOAT32); + + output()->resize(input()->shape()); +} + +void Dequantize::execute() const +{ + tflite::DequantizationParams op_params; + op_params.zero_point = input()->zero_point(); + op_params.scale = input()->scale(); + + switch (input()->element_type()) + { + case loco::DataType::U8: + { + luci_interpreter_pal::Dequantize(op_params, getTensorShape(input()), + getTensorData<uint8_t>(input()), getTensorShape(output()), + getTensorData<float>(output())); + break; + } + case loco::DataType::S8: + { + luci_interpreter_pal::Dequantize(op_params, getTensorShape(input()), + getTensorData<int8_t>(input()), getTensorShape(output()), + getTensorData<float>(output())); + break; + } + case loco::DataType::S16: + { + luci_interpreter_pal::Dequantize(op_params, getTensorShape(input()), + getTensorData<int16_t>(input()), getTensorShape(output()), + getTensorData<float>(output())); + break; + } + default: + throw std::runtime_error("Unsupported type."); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Dequantize.h b/compiler/luci-interpreter/src/kernels/Dequantize.h new file mode 100644 index 000000000..5565df0e4 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Dequantize.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022 
Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_DEQUANTIZE_H +#define LUCI_INTERPRETER_KERNELS_DEQUANTIZE_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class Dequantize : public Kernel +{ +public: + Dequantize(const Tensor *input, Tensor *output); + + const Tensor *input() const { return _inputs[0]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_DEQUANTIZE_H diff --git a/compiler/luci-interpreter/src/kernels/Dequantize.test.cpp b/compiler/luci-interpreter/src/kernels/Dequantize.test.cpp new file mode 100644 index 000000000..0cab633d6 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Dequantize.test.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/Dequantize.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class DequantizeTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } + + std::unique_ptr<IMemoryManager> _memory_manager; +}; + +TEST_F(DequantizeTest, Uint8) +{ + std::vector<uint8_t> input_data{0, 1, 2, 3, 4, 251, 252, 253, 254, 255}; + + std::vector<float> ref_output_data{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64}; + + Tensor input_tensor(loco::DataType::U8, {2, 5}, {{0.5}, {127}}, ""); + + _memory_manager->allocate_memory(input_tensor); + input_tensor.writeData(input_data.data(), input_data.size() * sizeof(uint8_t)); + + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Dequantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 5})); +} + +TEST_F(DequantizeTest, Sint8) +{ + std::vector<int8_t> input_data{-128, -127, -126, -125, -124, 123, 124, 125, 126, 127}; + + std::vector<float> ref_output_data{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64}; + + Tensor input_tensor(loco::DataType::S8, {2, 5}, {{0.5}, {-1}}, ""); + + 
_memory_manager->allocate_memory(input_tensor); + input_tensor.writeData(input_data.data(), input_data.size() * sizeof(int8_t)); + + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Dequantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 5})); +} + +TEST_F(DequantizeTest, Sint16) +{ + std::vector<int16_t> input_data{-129, -126, -125, -124, -123, 124, 125, 126, 127, 131}; + + std::vector<float> ref_output_data{-64.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 65.5}; + + Tensor input_tensor(loco::DataType::S16, {2, 5}, {{0.5}, {0}}, ""); + + _memory_manager->allocate_memory(input_tensor); + input_tensor.writeData(input_data.data(), input_data.size() * sizeof(int16_t)); + + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Dequantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 5})); +} + +TEST_F(DequantizeTest, InvalidInputType_NEG) +{ + std::vector<float> input_data{-129, -126, -125, -124, -123, 124, 125, 126, 127, 131}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>({2, 5}, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Dequantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(DequantizeTest, InvalidOutputType_NEG) +{ + std::vector<int16_t> input_data{-129, -126, -125, -124, -123, 124, 125, 126, 127, 131}; + + Tensor input_tensor(loco::DataType::S16, {2, 5}, {{0.5}, {0}}, ""); + + 
_memory_manager->allocate_memory(input_tensor); + input_tensor.writeData(input_data.data(), input_data.size() * sizeof(int16_t)); + + Tensor output_tensor = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1); + + Dequantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(DequantizeTest, InvalidInputZeroPoint_NEG) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + Tensor input_tensor = + makeInputTensor<DataType::S16>({2, 5}, 0.5, -1, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Dequantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Div.cpp b/compiler/luci-interpreter/src/kernels/Div.cpp index 0e52ba1f0..dd1532278 100644 --- a/compiler/luci-interpreter/src/kernels/Div.cpp +++ b/compiler/luci-interpreter/src/kernels/Div.cpp @@ -46,6 +46,12 @@ void Div::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -56,13 +62,9 @@ void Div::execute() const void Div::evalFloat() const { - float activation_min{}; - float activation_max{}; - calculateActivationRange(_params.activation, &activation_min, &activation_max); - tflite::ArithmeticParams params{}; - params.float_activation_min = activation_min; - params.float_activation_max = activation_max; + fillArithmeticActivationRange<float>(params, _params.activation); + const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( getTensorShape(input1()), getTensorShape(input2()), ¶ms); @@ -80,6 +82,28 @@ void Div::evalFloat() const } } +template <typename T> void Div::evalInteger() const +{ + tflite::ArithmeticParams params{}; + 
fillArithmeticActivationRange<T>(params, _params.activation); + + const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( + getTensorShape(input1()), getTensorShape(input2()), ¶ms); + + if (need_broadcast) + { + tflite::reference_ops::BroadcastDivSlow( + params, getTensorShape(input1()), getTensorData<T>(input1()), getTensorShape(input2()), + getTensorData<T>(input2()), getTensorShape(output()), getTensorData<T>(output())); + } + else + { + tflite::reference_ops::Div(params, getTensorShape(input1()), getTensorData<T>(input1()), + getTensorShape(input2()), getTensorData<T>(input2()), + getTensorShape(output()), getTensorData<T>(output())); + } +} + void Div::evalQuantized() const { const auto input1_scale = static_cast<double>(input1()->scale()); diff --git a/compiler/luci-interpreter/src/kernels/Div.h b/compiler/luci-interpreter/src/kernels/Div.h index 6040cdd02..c1bf3e10b 100644 --- a/compiler/luci-interpreter/src/kernels/Div.h +++ b/compiler/luci-interpreter/src/kernels/Div.h @@ -39,6 +39,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; }; diff --git a/compiler/luci-interpreter/src/kernels/Div.test.cpp b/compiler/luci-interpreter/src/kernels/Div.test.cpp index 021d68d06..85cd8b90a 100644 --- a/compiler/luci-interpreter/src/kernels/Div.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Div.test.cpp @@ -134,6 +134,56 @@ TEST_F(DivTest, Uint8) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape)); } +template <loco::DataType DType> void checkInteger(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + Shape base_shape = {2, 3, 1, 2}; + std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}}; + + std::vector<std::vector<dtype>> test_outputs = {{5, 6, 2, 0, 10, 3, // + 10, 0, 4, 5, 20, 0, // + 0, 0, 0, 2, 0, 0, // + 2, 0, 1, 10, 5, 0, // + 2, 3, 1, 
0, 5, 1, // + 18, 20, 7, 0, 37, 10}, + {5, 6, 4, 5, 0, 0, 2, 0, 1, 0, 37, 10}, + {5, 7, 4, 6, 2, 3, 10, 0, 8, 0, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 10, 5, 0, 1, 0, + 0, 0, 5, 9, 1, 1, 0, 0, 37, 50, 7, 10}, + {5, 7, 8, 0, 0, 0, 0, 10, 5, 9, 7, 10}}; + std::vector<dtype> input1_data{20, 30, 40, -17, -4, -7, 11, -31, 10, 19, 75, 100}; + std::vector<dtype> input2_data{4, 5, 10, -3, 2, 10}; + for (size_t i = 0; i < test_shapes.size(); ++i) + { + Tensor input1_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager); + Tensor input2_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DType); + + DivParams params{}; + params.activation = Activation::RELU; + + Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i]) + << "With shape number " << i; + } +} + +TEST_F(DivTest, SInt64) +{ + checkInteger<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(DivTest, SInt32) +{ + checkInteger<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + TEST_F(DivTest, Input_Output_Type_NEG) { Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get()); @@ -149,9 +199,9 @@ TEST_F(DivTest, Input_Output_Type_NEG) TEST_F(DivTest, Invalid_Input_Type_NEG) { - Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get()); - Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get()); - Tensor output_tensor = makeOutputTensor(DataType::S64); + Tensor input1_tensor = makeInputTensor<DataType::U64>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::U64>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::U64); DivParams params{}; params.activation = Activation::RELU; @@ 
-162,6 +212,19 @@ TEST_F(DivTest, Invalid_Input_Type_NEG) EXPECT_ANY_THROW(kernel.execute()); } +TEST_F(DivTest, Invalid_Output_Type_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::S32>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S64); + + DivParams params{}; + params.activation = Activation::RELU; + + Div kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Equal.cpp b/compiler/luci-interpreter/src/kernels/Equal.cpp index f58de1250..a57e127b7 100644 --- a/compiler/luci-interpreter/src/kernels/Equal.cpp +++ b/compiler/luci-interpreter/src/kernels/Equal.cpp @@ -49,6 +49,12 @@ void Equal::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -79,6 +85,29 @@ void Equal::evalFloat() const } } +template <typename T> void Equal::evalInteger() const +{ + const auto x_data = getTensorData<T>(x()); + const auto y_data = getTensorData<T>(y()); + auto output_data = getTensorData<bool>(output()); + + tflite::ComparisonParams op_params; + op_params.is_broadcast = x()->shape() != y()->shape(); + + if (op_params.is_broadcast) + { + tflite::reference_ops::Broadcast4DSlowEqualNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, + getTensorShape(output()), output_data); + } + else + { + tflite::reference_ops::EqualNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, getTensorShape(output()), + output_data); + } +} + void Equal::evalQuantized() const { const auto x_data = getTensorData<uint8_t>(x()); diff --git 
a/compiler/luci-interpreter/src/kernels/Equal.h b/compiler/luci-interpreter/src/kernels/Equal.h index 11f025eac..c9be32cc0 100644 --- a/compiler/luci-interpreter/src/kernels/Equal.h +++ b/compiler/luci-interpreter/src/kernels/Equal.h @@ -38,6 +38,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; private: diff --git a/compiler/luci-interpreter/src/kernels/Equal.test.cpp b/compiler/luci-interpreter/src/kernels/Equal.test.cpp index 46a0f97d8..5870e5460 100644 --- a/compiler/luci-interpreter/src/kernels/Equal.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Equal.test.cpp @@ -99,6 +99,82 @@ TEST_F(EqualTest, FloatBroardcast) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); } +template <loco::DataType DType> +void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{min_value, 2, max_value}; + + std::vector<dtype> y_data{min_value, -2, max_value}; + + std::vector<bool> ref_output_data{true, false, true}; + + Tensor x_tensor = makeInputTensor<DType>({3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Equal kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3})); +} + +template <loco::DataType DType> +void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = 
std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{ + min_value, 2, 3, // Row 1 + 4, 5, max_value, // Row 2 + -1, -2, -3, // Row 3 + min_value, -2, max_value, // Row 4 + }; + + std::vector<dtype> y_data{ + min_value, -2, max_value, // Row 1 + }; + + std::vector<bool> ref_output_data{ + true, false, false, // Row 1 + false, false, true, // Row 2 + false, true, false, // Row 3 + true, true, true, // Row 4 + }; + + Tensor x_tensor = makeInputTensor<DType>({4, 3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Equal kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); +} + +TEST_F(EqualTest, Int32) +{ + checkIntegerSimple<loco::DataType::S32>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(EqualTest, Int64) +{ + checkIntegerSimple<loco::DataType::S64>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors. 
const float F_MIN = -128.0 / 128.0; const float F_MAX = 127.0 / 128.0; @@ -195,6 +271,36 @@ TEST_F(EqualTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } +TEST_F(EqualTest, Float_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Equal kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(EqualTest, Int32_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Equal kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(EqualTest, Int64_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Equal kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/ExpandDims.cpp b/compiler/luci-interpreter/src/kernels/ExpandDims.cpp new file mode 100644 index 000000000..ba35c99fa --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/ExpandDims.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/ExpandDims.h" +#include "kernels/Utils.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +ExpandDims::ExpandDims(const Tensor *input, const Tensor *axis, Tensor *output) + : Kernel({input, axis}, {output}) +{ +} + +void ExpandDims::configure() +{ + int32_t axis_value; + + switch (axis()->element_type()) + { + case loco::DataType::S32: + axis_value = *getTensorData<int32_t>(axis()); + break; + case loco::DataType::S64: + axis_value = static_cast<int32_t>(*getTensorData<int64_t>(axis())); + break; + default: + throw std::runtime_error("Unsupported type."); + } + + const auto input_shape = input()->shape(); + + if (axis_value < 0) + { + axis_value += input_shape.num_dims() + 1; + } + + LUCI_INTERPRETER_CHECK(axis_value <= input_shape.num_dims() and axis_value >= 0); + + Shape output_shape(input_shape.num_dims() + 1); + for (int32_t i = 0; i < output_shape.num_dims(); ++i) + { + if (i < axis_value) + { + output_shape.dim(i) = input_shape.dim(i); + } + else if (i == axis_value) + { + output_shape.dim(i) = 1; + } + else + { + LUCI_INTERPRETER_CHECK(i >= 1); + output_shape.dim(i) = input_shape.dim(i - 1); + } + } + + output()->resize(output_shape); +} + +void ExpandDims::execute() const +{ + // Just copy input to output + const auto *input_data = input()->data<void>(); + auto *output_data = output()->data<void>(); + + const size_t element_size = getDataTypeSize(input()->element_type()); + const int32_t num_elements = input()->shape().num_elements(); + std::memcpy(output_data, input_data, num_elements * element_size); +} 
+ +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/ExpandDims.h b/compiler/luci-interpreter/src/kernels/ExpandDims.h new file mode 100644 index 000000000..e510b1160 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/ExpandDims.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_EXPAND_DIMS_H +#define LUCI_INTERPRETER_KERNELS_EXPAND_DIMS_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class ExpandDims : public Kernel +{ +public: + ExpandDims(const Tensor *input, const Tensor *axis, Tensor *output); + + const Tensor *input() const { return _inputs[0]; } + const Tensor *axis() const { return _inputs[1]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_EXPAND_DIMS_H diff --git a/compiler/luci-interpreter/src/kernels/ExpandDims.test.cpp b/compiler/luci-interpreter/src/kernels/ExpandDims.test.cpp new file mode 100644 index 000000000..df9eaccc0 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/ExpandDims.test.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/ExpandDims.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class ExpandDimsTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } + + std::unique_ptr<IMemoryManager> _memory_manager; +}; + +TEST_F(ExpandDimsTest, PositiveAxis) +{ + std::vector<int32_t> input_data{-1, 1, -2, 2}; + std::initializer_list<int32_t> input_shape = {2, 2}; + + std::initializer_list<int32_t> axis_value = {0}; + + Tensor input_tensor = + makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get()); + Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_value, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<int32_t>(output_tensor), ::testing::ElementsAreArray(input_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2})); +} + +TEST_F(ExpandDimsTest, NegAxis) +{ + std::vector<int32_t> input_data{-1, 1, -2, 2}; + 
std::initializer_list<int32_t> input_shape = {2, 2}; + + std::initializer_list<int32_t> axis_value = {-1}; + + Tensor input_tensor = + makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get()); + Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_value, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<int32_t>(output_tensor), ::testing::ElementsAreArray(input_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 2, 1})); +} + +TEST_F(ExpandDimsTest, InvalidAxisType_NEG) +{ + std::vector<int32_t> input_data{-1, 1, -2, 2}; + std::initializer_list<int32_t> input_shape = {2, 2}; + + std::initializer_list<float> axis_value = {1.0}; + + Tensor input_tensor = + makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get()); + Tensor axis_tensor = makeInputTensor<DataType::FLOAT32>({1}, axis_value, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(ExpandDimsTest, InvalidAxisValue_NEG) +{ + std::vector<int32_t> input_data{-1, 1, -2, 2}; + std::initializer_list<int32_t> input_shape = {2, 2}; + + std::initializer_list<int32_t> axis_value = {3}; + + Tensor input_tensor = + makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get()); + Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_value, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + ExpandDims kernel(&input_tensor, &axis_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git 
a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp index cfe8f8bf2..bd2bb2f35 100644 --- a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp +++ b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp @@ -18,8 +18,7 @@ #include "kernels/Utils.h" -#include <tensorflow/lite/kernels/internal/reference/fully_connected.h> -#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h> +#include "PALFullyConnected.h" #include <stdexcept> @@ -74,7 +73,18 @@ void FullyConnected::configure() if (bias()) LUCI_INTERPRETER_CHECK(bias()->shape().num_elements() == weights()->shape().dim(0)); - output()->resize({batch_size, num_units}); + if (params().keep_num_dims == false) + { + output()->resize({batch_size, num_units}); + } + else + { + luci_interpreter::Shape output_shape(input_shape.num_dims()); + for (int i = 0; i < input_shape.num_dims(); ++i) + output_shape.dim(i) = input_shape.dim(i); + output_shape.dim(input_shape.num_dims() - 1) = num_units; + output()->resize(output_shape); + } } void FullyConnected::execute() const @@ -172,7 +182,7 @@ void FullyConnected::evalQuantizedS8() const op_params.quantized_activation_max = output_activation_max; op_params.lhs_cacheable = false; op_params.rhs_cacheable = false; - tflite::reference_integer_ops::FullyConnected( + luci_interpreter_pal::FullyConnected<int8_t>( op_params, getTensorShape(input()), getTensorData<int8_t>(input()), getTensorShape(weights()), getTensorData<int8_t>(weights()), getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()), getTensorData<int8_t>(output())); diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp index b0eda0145..4474cc4fb 100644 --- a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp +++ b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp @@ -133,7 +133,7 @@ template 
<typename T> class FullyConnectedTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t, int8_t>; -TYPED_TEST_CASE(FullyConnectedTest, DataTypes); +TYPED_TEST_SUITE(FullyConnectedTest, DataTypes); TYPED_TEST(FullyConnectedTest, Simple) { diff --git a/compiler/luci-interpreter/src/kernels/Gather.cpp b/compiler/luci-interpreter/src/kernels/Gather.cpp new file mode 100644 index 000000000..f1256660f --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Gather.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2021 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Gather.h" +#include "kernels/Utils.h" +#include "PALGather.h" + +#include <stdexcept> +#include <cassert> + +namespace luci_interpreter +{ + +namespace kernels +{ + +Gather::Gather(const Tensor *params, const Tensor *indices, Tensor *output, + const GatherParams &gparams) + : KernelWithParams<GatherParams>({params, indices}, {output}, gparams) +{ +} + +void Gather::configure() +{ + if (params()->element_type() == DataType::FLOAT32) + { + LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::FLOAT32); + } + else + { + throw std::runtime_error("Unsupported type."); + } + + LUCI_INTERPRETER_CHECK(indices()->element_type() == DataType::S32 || + indices()->element_type() == DataType::S64); + + // refer tensorflow/lite/kernels/gather.cc + + const Shape ¶ms_shape = params()->shape(); + const Shape &indices_shape = indices()->shape(); + + int axis = _params.axis; + if (axis < 0) + { + axis += params_shape.num_dims(); + } + LUCI_INTERPRETER_CHECK(0 <= axis && axis < params_shape.num_dims()); + + int batch_dims = _params.batch_dims; + // batch_dims should be in range: [-rank(indices), rank(indices)]. + // Negative batch_dims is added with rank of positions. 
+ if (batch_dims < 0) + { + batch_dims += indices_shape.num_dims(); + } + LUCI_INTERPRETER_CHECK(batch_dims <= axis); + LUCI_INTERPRETER_CHECK(0 <= batch_dims && batch_dims < params_shape.num_dims()); + LUCI_INTERPRETER_CHECK(batch_dims <= indices_shape.num_dims()); + for (int i = 0; i < batch_dims; ++i) + { + LUCI_INTERPRETER_CHECK(params_shape.dim(i) == indices_shape.dim(i)); + } + + const int num_dimensions = params_shape.num_dims() + indices_shape.num_dims() - 1 - batch_dims; + + Shape output_shape(num_dimensions); + int output_index = 0; + for (int i = 0; i < axis; ++i) + { + output_shape.dim(output_index++) = params_shape.dim(i); + } + for (int i = batch_dims; i < indices_shape.num_dims(); ++i) + { + output_shape.dim(output_index++) = indices_shape.dim(i); + } + for (int i = axis + 1; i < params_shape.num_dims(); ++i) + { + output_shape.dim(output_index++) = params_shape.dim(i); + } + output()->resize(output_shape); +} + +void Gather::execute() const +{ + switch (params()->element_type()) + { + case DataType::FLOAT32: + evalFloat(); + break; + default: + throw std::runtime_error("Unsupported type."); + } +} + +void Gather::evalFloat() const +{ + assert(indices()->element_type() == DataType::S32 || indices()->element_type() == DataType::S64); + + const auto params_data = getTensorData<float>(params()); + auto output_data = getTensorData<float>(output()); + + tflite::GatherParams tparams; + tparams.axis = _params.axis; + tparams.batch_dims = _params.batch_dims; + + if (indices()->element_type() == DataType::S32) + { + const auto indices_data = getTensorData<int32_t>(indices()); + + luci_interpreter_pal::Gather<float, int32_t>(tparams, getTensorShape(params()), params_data, + getTensorShape(indices()), indices_data, + getTensorShape(output()), output_data); + } + else + { + const auto indices_data = getTensorData<int64_t>(indices()); + + luci_interpreter_pal::Gather<float, int64_t>(tparams, getTensorShape(params()), params_data, + getTensorShape(indices()), 
indices_data, + getTensorShape(output()), output_data); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Gather.h b/compiler/luci-interpreter/src/kernels/Gather.h new file mode 100644 index 000000000..cc02d64fb --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Gather.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_KERNELS_GATHER_H +#define LUCI_INTERPRETER_KERNELS_GATHER_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class Gather : public KernelWithParams<GatherParams> +{ +public: + Gather(const Tensor *params, const Tensor *indices, Tensor *output, const GatherParams &gparams); + + const Tensor *params() const { return _inputs[0]; } + const Tensor *indices() const { return _inputs[1]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; + +private: + void evalFloat() const; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_GATHER_H diff --git a/compiler/luci-interpreter/src/kernels/Gather.test.cpp b/compiler/luci-interpreter/src/kernels/Gather.test.cpp new file mode 100644 index 000000000..4b3dda708 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Gather.test.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Gather.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class GatherTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } + + std::unique_ptr<IMemoryManager> _memory_manager; +}; + +TEST_F(GatherTest, Simple) +{ + std::vector<float> params_data{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; + std::vector<int32_t> indices_data{1, 0, 1, 5}; + std::vector<float> ref_output_data{2.f, 1.f, 2.f, 6.f}; + + Tensor params_tensor = + makeInputTensor<DataType::FLOAT32>({1, 6}, params_data, _memory_manager.get()); + Tensor indices_tensor = makeInputTensor<DataType::S32>({4}, indices_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + GatherParams gparams; + + gparams.axis = 1; + gparams.batch_dims = 0; + + Gather kernel(¶ms_tensor, &indices_tensor, &output_tensor, gparams); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4})); +} + +TEST_F(GatherTest, Simple_Batch) +{ + Shape params_shape = {3, 5}; + Shape indices_shape = {3, 2}; + std::vector<float> params_data{0., 0., 1., 0., 2., 3., 0., 0., 0., 4., 0., 5., 0., 6., 0.}; + std::vector<int32_t> indices_data{2, 4, 0, 4, 1, 3}; + std::vector<float> ref_output_data{1., 2., 3., 4., 5., 6.}; + + Tensor params_tensor = + makeInputTensor<DataType::FLOAT32>(params_shape, params_data, _memory_manager.get()); + Tensor indices_tensor = + makeInputTensor<DataType::S32>(indices_shape, indices_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + GatherParams gparams; + + gparams.axis = 1; + gparams.batch_dims = 1; 
+ + Gather kernel(¶ms_tensor, &indices_tensor, &output_tensor, gparams); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<float>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 2})); +} + +TEST_F(GatherTest, Simple_NEG) +{ + Tensor params_tensor = makeInputTensor<DataType::S32>({1}, {1}, _memory_manager.get()); + Tensor indices_tensor = makeInputTensor<DataType::S32>({1}, {0}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + GatherParams gparams; + + Gather kernel(¶ms_tensor, &indices_tensor, &output_tensor, gparams); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(GatherTest, Axis_NEG) +{ + Tensor params_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get()); + Tensor indices_tensor = makeInputTensor<DataType::S32>({1}, {0}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + GatherParams gparams; + + gparams.axis = 100; + gparams.batch_dims = 0; + + Gather kernel(¶ms_tensor, &indices_tensor, &output_tensor, gparams); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(GatherTest, Batch_NEG) +{ + std::vector<float> params_data{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; + std::vector<int32_t> indices_data{1, 0, 1, 5}; + std::vector<float> ref_output_data{2.f, 1.f, 2.f, 6.f}; + + Tensor params_tensor = + makeInputTensor<DataType::FLOAT32>({1, 6}, params_data, _memory_manager.get()); + Tensor indices_tensor = makeInputTensor<DataType::S32>({4}, indices_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + GatherParams gparams; + + gparams.axis = 0; + gparams.batch_dims = 1; + + Gather kernel(¶ms_tensor, &indices_tensor, &output_tensor, gparams); + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter 
diff --git a/compiler/luci-interpreter/src/kernels/Greater.cpp b/compiler/luci-interpreter/src/kernels/Greater.cpp index f0dd2db36..5ccae3c38 100644 --- a/compiler/luci-interpreter/src/kernels/Greater.cpp +++ b/compiler/luci-interpreter/src/kernels/Greater.cpp @@ -49,6 +49,12 @@ void Greater::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -79,6 +85,29 @@ void Greater::evalFloat() const } } +template <typename T> void Greater::evalInteger() const +{ + const auto x_data = getTensorData<T>(x()); + const auto y_data = getTensorData<T>(y()); + auto output_data = getTensorData<bool>(output()); + + tflite::ComparisonParams op_params; + op_params.is_broadcast = x()->shape() != y()->shape(); + + if (op_params.is_broadcast) + { + tflite::reference_ops::Broadcast4DSlowGreaterNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, + getTensorShape(output()), output_data); + } + else + { + tflite::reference_ops::GreaterNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, getTensorShape(output()), + output_data); + } +} + void Greater::evalQuantized() const { const auto x_data = getTensorData<uint8_t>(x()); diff --git a/compiler/luci-interpreter/src/kernels/Greater.h b/compiler/luci-interpreter/src/kernels/Greater.h index 877c139c9..065f76d7b 100644 --- a/compiler/luci-interpreter/src/kernels/Greater.h +++ b/compiler/luci-interpreter/src/kernels/Greater.h @@ -38,6 +38,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; private: diff --git a/compiler/luci-interpreter/src/kernels/Greater.test.cpp b/compiler/luci-interpreter/src/kernels/Greater.test.cpp index ba3925f17..a48080124 100644 --- a/compiler/luci-interpreter/src/kernels/Greater.test.cpp +++ 
b/compiler/luci-interpreter/src/kernels/Greater.test.cpp @@ -97,6 +97,82 @@ TEST_F(GreaterTest, FloatBroardcast) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3})); } +template <loco::DataType DType> +void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{min_value, 2, max_value}; + + std::vector<dtype> y_data{min_value + 1, -2, max_value}; + + std::vector<bool> ref_output_data{false, true, false}; + + Tensor x_tensor = makeInputTensor<DType>({3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Greater kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3})); +} + +template <loco::DataType DType> +void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{ + min_value, 2, 3, // Row 1 + 4, 5, max_value, // Row 2 + -1, -4, -3, // Row 3 + min_value, -2, max_value, // Row 4 + }; + + std::vector<dtype> y_data{ + min_value + 1, -2, max_value - 1, // Row 1 + }; + + std::vector<bool> ref_output_data{ + false, true, false, // Row 1 + true, true, true, // Row 2 + true, false, false, // Row 3 + false, false, true, // Row 4 + }; + + Tensor x_tensor = makeInputTensor<DType>({4, 3}, x_data, memory_manager); + Tensor y_tensor = 
makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Greater kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); +} + +TEST_F(GreaterTest, Int32) +{ + checkIntegerSimple<loco::DataType::S32>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(GreaterTest, Int64) +{ + checkIntegerSimple<loco::DataType::S64>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors. const float F_MIN = -128.0 / 128.0; const float F_MAX = 127.0 / 128.0; @@ -223,6 +299,36 @@ TEST_F(GreaterTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } +TEST_F(GreaterTest, Float_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Greater kernel(&x_tensor, &y_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(GreaterTest, Int32_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Greater kernel(&x_tensor, &y_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(GreaterTest, Int64_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, 
_memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Greater kernel(&x_tensor, &y_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp b/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp index e7c1b4afe..27e42c971 100644 --- a/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp +++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp @@ -52,6 +52,12 @@ void GreaterEqual::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -82,6 +88,29 @@ void GreaterEqual::evalFloat() const } } +template <typename T> void GreaterEqual::evalInteger() const +{ + const auto x_data = getTensorData<T>(x()); + const auto y_data = getTensorData<T>(y()); + auto output_data = getTensorData<bool>(output()); + + tflite::ComparisonParams op_params; + op_params.is_broadcast = x()->shape() != y()->shape(); + + if (op_params.is_broadcast) + { + tflite::reference_ops::Broadcast4DSlowGreaterEqualNoScaling( + op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()), + output_data); + } + else + { + tflite::reference_ops::GreaterEqualNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, + getTensorShape(output()), output_data); + } +} + void GreaterEqual::evalQuantized() const { const auto x_data = getTensorData<uint8_t>(x()); diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.h b/compiler/luci-interpreter/src/kernels/GreaterEqual.h index 4a0f48748..e333c30a6 100644 --- a/compiler/luci-interpreter/src/kernels/GreaterEqual.h +++ 
b/compiler/luci-interpreter/src/kernels/GreaterEqual.h @@ -38,6 +38,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; private: diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp index a9d172301..35bf88eab 100644 --- a/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp +++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp @@ -96,6 +96,81 @@ TEST_F(GreaterEqualTest, FloatBroardcast) EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3})); } +template <loco::DataType DType> +void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{min_value, 2, max_value}; + + std::vector<dtype> y_data{min_value + 1, -2, max_value}; + + std::vector<bool> ref_output_data{false, true, true}; + + Tensor x_tensor = makeInputTensor<DType>({3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3})); +} + +template <loco::DataType DType> +void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value 
= std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{ + min_value, 2, 3, // Row 1 + 4, 5, max_value, // Row 2 + -1, -4, -3, // Row 3 + min_value, -2, max_value - 1, // Row 4 + }; + + std::vector<dtype> y_data{ + min_value + 1, -2, max_value - 1, // Row 1 + }; + + std::vector<bool> ref_output_data{ + false, true, false, // Row 1 + true, true, true, // Row 2 + true, false, false, // Row 3 + false, true, true, // Row 4 + }; + + Tensor x_tensor = makeInputTensor<DType>({4, 3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); +} + +TEST_F(GreaterEqualTest, Int32) +{ + checkIntegerSimple<loco::DataType::S32>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(GreaterEqualTest, Int64) +{ + checkIntegerSimple<loco::DataType::S64>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors. 
const float F_MIN = -128.0 / 128.0; @@ -223,6 +298,36 @@ TEST_F(GreaterEqualTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } +TEST_F(GreaterEqualTest, Float_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(GreaterEqualTest, Int32_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(GreaterEqualTest, Int64_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp index 1e565e358..6f960e8b4 100644 --- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp +++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp @@ -81,7 +81,7 @@ template <typename T> class L2NormalizeTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(L2NormalizeTest, DataTypes); +TYPED_TEST_SUITE(L2NormalizeTest, DataTypes); TYPED_TEST(L2NormalizeTest, Simple) { diff 
--git a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp index 289742a50..7245456cb 100644 --- a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp +++ b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp @@ -206,7 +206,8 @@ TEST_F(L2Pool2DTest, FloatPaddingSameStride) kernel.execute(); std::vector<float> ref_output_data{3.5, 6.0, 6.5, 5.70088, 2.54951, 7.2111, 8.63134, 7.0}; - EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data)); + // NOTE with NEON+ruy, error is #1=-1.14441e-05, #6=-1.81198e-05 + EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data, 1.0e-4f)); // TODO make a Shape checking of output_tensor. } diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp index 6ec8a348a..0f6263b57 100644 --- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp +++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp @@ -83,7 +83,7 @@ template <typename T> class LeakReluTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(LeakReluTest, DataTypes); +TYPED_TEST_SUITE(LeakReluTest, DataTypes); TYPED_TEST(LeakReluTest, Simple) { diff --git a/compiler/luci-interpreter/src/kernels/Less.cpp b/compiler/luci-interpreter/src/kernels/Less.cpp index 041444926..8d26ff297 100644 --- a/compiler/luci-interpreter/src/kernels/Less.cpp +++ b/compiler/luci-interpreter/src/kernels/Less.cpp @@ -49,6 +49,12 @@ void Less::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -79,6 +85,29 @@ void Less::evalFloat() const } } +template <typename T> void Less::evalInteger() const +{ + const auto x_data = getTensorData<T>(x()); + const auto y_data = 
getTensorData<T>(y()); + auto output_data = getTensorData<bool>(output()); + + tflite::ComparisonParams op_params; + op_params.is_broadcast = x()->shape() != y()->shape(); + + if (op_params.is_broadcast) + { + tflite::reference_ops::Broadcast4DSlowLessNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, + getTensorShape(output()), output_data); + } + else + { + tflite::reference_ops::LessNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, getTensorShape(output()), + output_data); + } +} + void Less::evalQuantized() const { const auto x_data = getTensorData<uint8_t>(x()); diff --git a/compiler/luci-interpreter/src/kernels/Less.h b/compiler/luci-interpreter/src/kernels/Less.h index 293740e72..e27bb689c 100644 --- a/compiler/luci-interpreter/src/kernels/Less.h +++ b/compiler/luci-interpreter/src/kernels/Less.h @@ -38,6 +38,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; private: diff --git a/compiler/luci-interpreter/src/kernels/Less.test.cpp b/compiler/luci-interpreter/src/kernels/Less.test.cpp index e9d09b288..8c5963363 100644 --- a/compiler/luci-interpreter/src/kernels/Less.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Less.test.cpp @@ -97,6 +97,82 @@ TEST_F(LessTest, FloatBroardcast) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3})); } +template <loco::DataType DType> +void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{min_value, 2, max_value}; + + std::vector<dtype> y_data{min_value + 1, -2, max_value}; + + std::vector<bool> ref_output_data{true, false, false}; + + Tensor x_tensor = makeInputTensor<DType>({3}, x_data, memory_manager); + Tensor y_tensor = 
makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Less kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3})); +} + +template <loco::DataType DType> +void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{ + min_value, 2, 3, // Row 1 + 4, 5, max_value, // Row 2 + -1, -4, -3, // Row 3 + min_value, -2, max_value, // Row 4 + }; + + std::vector<dtype> y_data{ + min_value + 1, -2, max_value - 1, // Row 1 + }; + + std::vector<bool> ref_output_data{ + true, false, true, // Row 1 + false, false, false, // Row 2 + false, true, true, // Row 3 + true, false, false, // Row 4 + }; + + Tensor x_tensor = makeInputTensor<DType>({4, 3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Less kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); +} + +TEST_F(LessTest, Int32) +{ + checkIntegerSimple<loco::DataType::S32>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(LessTest, Int64) +{ + checkIntegerSimple<loco::DataType::S64>(_memory_manager.get()); + 
checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors. const float F_MIN = -128.0 / 128.0; const float F_MAX = 127.0 / 128.0; @@ -223,6 +299,36 @@ TEST_F(LessTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } +TEST_F(LessTest, Float_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Less kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(LessTest, Int32_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Less kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(LessTest, Int64_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + Less kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.cpp b/compiler/luci-interpreter/src/kernels/LessEqual.cpp index 5f4c7f7aa..b474bc47a 100644 --- a/compiler/luci-interpreter/src/kernels/LessEqual.cpp +++ b/compiler/luci-interpreter/src/kernels/LessEqual.cpp @@ -49,6 +49,12 @@ void LessEqual::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; 
+ case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -79,6 +85,29 @@ void LessEqual::evalFloat() const } } +template <typename T> void LessEqual::evalInteger() const +{ + const auto x_data = getTensorData<T>(x()); + const auto y_data = getTensorData<T>(y()); + auto output_data = getTensorData<bool>(output()); + + tflite::ComparisonParams op_params; + op_params.is_broadcast = x()->shape() != y()->shape(); + + if (op_params.is_broadcast) + { + tflite::reference_ops::Broadcast4DSlowLessEqualNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, + getTensorShape(output()), output_data); + } + else + { + tflite::reference_ops::LessEqualNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, getTensorShape(output()), + output_data); + } +} + void LessEqual::evalQuantized() const { const auto x_data = getTensorData<uint8_t>(x()); diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.h b/compiler/luci-interpreter/src/kernels/LessEqual.h index b6da1a2a8..f82ea90d4 100644 --- a/compiler/luci-interpreter/src/kernels/LessEqual.h +++ b/compiler/luci-interpreter/src/kernels/LessEqual.h @@ -38,6 +38,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; private: diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp index 0558003dd..b2e2fa7a1 100644 --- a/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp +++ b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp @@ -97,6 +97,82 @@ TEST_F(LessEqualTest, FloatBroardcast) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3, 3})); } +template <loco::DataType DType> +void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = 
std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{min_value, 2, max_value}; + + std::vector<dtype> y_data{min_value + 1, -2, max_value}; + + std::vector<bool> ref_output_data{true, false, true}; + + Tensor x_tensor = makeInputTensor<DType>({3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + LessEqual kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3})); +} + +template <loco::DataType DType> +void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{ + min_value, 2, 3, // Row 1 + 4, 5, max_value, // Row 2 + -1, -4, -3, // Row 3 + min_value, -2, max_value, // Row 4 + }; + + std::vector<dtype> y_data{ + min_value + 1, -2, max_value - 1, // Row 1 + }; + + std::vector<bool> ref_output_data{ + true, false, true, // Row 1 + false, false, false, // Row 2 + false, true, true, // Row 3 + true, true, false, // Row 4 + }; + + Tensor x_tensor = makeInputTensor<DType>({4, 3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + LessEqual kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + 
EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); +} + +TEST_F(LessEqualTest, Int32) +{ + checkIntegerSimple<loco::DataType::S32>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(LessEqualTest, Int64) +{ + checkIntegerSimple<loco::DataType::S64>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + // Choose min / max in such a way that there are exactly 256 units to avoid rounding errors. const float F_MIN = -128.0 / 128.0; const float F_MAX = 127.0 / 128.0; @@ -223,6 +299,36 @@ TEST_F(LessEqualTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } +TEST_F(LessEqualTest, Float_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + LessEqual kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(LessEqualTest, Int32_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + LessEqual kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(LessEqualTest, Int64_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + LessEqual kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff 
--git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp index 70227563f..5a1ea669c 100644 --- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp @@ -76,7 +76,7 @@ template <typename T> class LogisticTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(LogisticTest, DataTypes); +TYPED_TEST_SUITE(LogisticTest, DataTypes); TYPED_TEST(LogisticTest, Simple) { diff --git a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp index 89049c96c..2fbeefce4 100644 --- a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp +++ b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,8 +19,6 @@ #include "kernels/Utils.h" -#include <tensorflow/lite/kernels/internal/reference/pad.h> - namespace luci_interpreter { namespace kernels @@ -59,44 +58,25 @@ void MirrorPad::configure() output()->resize(output_shape); } +template <typename T> +inline void MirrorPadImpl(const Tensor &input, const Tensor &paddings, MirrorPadMode mode, + Tensor &output); + void MirrorPad::execute() const { - const int num_dims = input()->shape().num_dims(); - - tflite::PadParams params{}; - params.left_padding_count = num_dims; - params.right_padding_count = num_dims; - - const auto *paddings_data = getTensorData<int32_t>(paddings()); - for (int i = num_dims - 1; i >= 0; --i) - { - params.left_padding[i] = paddings_data[i * 2]; - params.right_padding[i] = paddings_data[i * 2 + 1]; - } - switch (input()->element_type()) { case DataType::FLOAT32: { - const float pad_value = 0; - - // NOTE: this implementation only obtains min-max values for quantization - // TODO: calculate proper inference values - tflite::reference_ops::Pad(params, getTensorShape(input()), getTensorData<float>(input()), - &pad_value, getTensorShape(output()), - getTensorData<float>(output())); + MirrorPadImpl<float>(*input(), *paddings(), params().mode, *output()); break; } case DataType::U8: { - // NOTE: this implementation only obtains min-max values for quantization - // TODO: calculate proper inference values assert(output()->zero_point() >= std::numeric_limits<uint8_t>::min()); assert(output()->zero_point() <= std::numeric_limits<uint8_t>::max()); - const auto pad_value = static_cast<uint8_t>(output()->zero_point()); - tflite::reference_ops::Pad(params, getTensorShape(input()), getTensorData<uint8_t>(input()), - &pad_value, getTensorShape(output()), - getTensorData<uint8_t>(output())); + + MirrorPadImpl<uint8_t>(*input(), *paddings(), params().mode, *output()); break; } default: @@ -104,5 +84,87 @@ void MirrorPad::execute() const } } +template <typename T> +inline void MirrorPadImpl(const Tensor 
&input, const Tensor &paddings, MirrorPadMode mode, + Tensor &output) +{ + auto const input_dims = input.shape().num_dims(); + auto const input_data = input.data<T>(); + auto const paddings_data = paddings.data<int32_t>(); + auto const output_data = output.data<T>(); + + auto const input_b = input_dims > 3 ? input.shape().dim(input_dims - 4) : 1; + auto const input_h = input_dims > 2 ? input.shape().dim(input_dims - 3) : 1; + auto const input_w = input_dims > 1 ? input.shape().dim(input_dims - 2) : 1; + auto const input_d = input.shape().dim(input_dims - 1); + + auto const input_h_offset = input_d * input_w; + auto const input_b_offset = input_h_offset * input_h; + + auto const output_b = input_dims > 3 ? output.shape().dim(input_dims - 4) : 1; + auto const output_h = input_dims > 2 ? output.shape().dim(input_dims - 3) : 1; + auto const output_w = input_dims > 1 ? output.shape().dim(input_dims - 2) : 1; + auto const output_d = output.shape().dim(input_dims - 1); + + auto const left_b_pad = paddings_data[2 * (input_dims - 4)]; + auto const left_h_pad = paddings_data[2 * (input_dims - 3)]; + auto const left_w_pad = paddings_data[2 * (input_dims - 2)]; + auto const left_d_pad = paddings_data[2 * (input_dims - 1)]; + + auto const right_b_pad = paddings_data[2 * (input_dims - 4) + 1]; + auto const right_h_pad = paddings_data[2 * (input_dims - 3) + 1]; + auto const right_w_pad = paddings_data[2 * (input_dims - 2) + 1]; + auto const right_d_pad = paddings_data[2 * (input_dims - 1) + 1]; + + const auto positive_mod = [](auto a, auto b) { return (a % b + b) % b; }; + const auto offset_index = [input_d, input_h_offset, input_b_offset](auto d, auto w, auto h, + auto b) { + return d + w * input_d + h * input_h_offset + b * input_b_offset; + }; + + const auto symmetric_dim = [&positive_mod](auto i, auto left_pad, auto input) { + bool reflected = (((i < left_pad ? i + 1 - input : i) - left_pad) / input & 1) == 1; + return positive_mod(reflected ? 
input + left_pad - i - 1 : i - left_pad, input); + }; + + const T *in_ptr = input_data; + T *out_ptr = output_data; + + for (int32_t b = 0; b < output_b; ++b) + { + for (int32_t h = 0; h < output_h; ++h) + { + for (int32_t w = 0; w < output_w; ++w) + { + for (int32_t d = 0; d < output_d; ++d) + { + if (b < left_b_pad || b >= output_b - right_b_pad || // + h < left_h_pad || h >= output_h - right_h_pad || // + w < left_w_pad || w >= output_w - right_w_pad || // + d < left_d_pad || d >= output_d - right_d_pad) + { + if (mode == MirrorPadMode::REFLECT) + { + *out_ptr++ = input_data[offset_index( + positive_mod(d - left_d_pad, input_d), positive_mod(w - left_w_pad, input_w), + positive_mod(h - left_h_pad, input_h), positive_mod(b - left_b_pad, input_b))]; + } + else + { + *out_ptr++ = input_data[offset_index( + symmetric_dim(d, left_d_pad, input_d), symmetric_dim(w, left_w_pad, input_w), + symmetric_dim(h, left_h_pad, input_h), symmetric_dim(b, left_b_pad, input_b))]; + } + } + else + { + *out_ptr++ = *in_ptr++; + } + } + } + } + } +} + } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/MirrorPad.test.cpp b/compiler/luci-interpreter/src/kernels/MirrorPad.test.cpp index de9da5051..740d8cb22 100644 --- a/compiler/luci-interpreter/src/kernels/MirrorPad.test.cpp +++ b/compiler/luci-interpreter/src/kernels/MirrorPad.test.cpp @@ -14,4 +14,212 @@ * limitations under the License. 
*/ -// TODO: Add tests for MirrorPad +#include "kernels/MirrorPad.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class MirrorPadTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } + + void Execute(const Tensor &input, const Tensor &padding, Tensor &output, MirrorPadMode mode) + { + MirrorPadParams params{}; + params.mode = mode; + + MirrorPad kernel(&input, &padding, &output, params); + kernel.configure(); + _memory_manager->allocate_memory(output); + kernel.execute(); + } + + std::unique_ptr<IMemoryManager> _memory_manager; +}; + +TEST_F(MirrorPadTest, FloatReflect) +{ + Shape input_shape = {1, 2, 2, 1}; + Shape padding_shape = {4, 2}; + + std::vector<float> input_data{1.0f, 2.0f, // + 3.0f, 4.0f}; // + std::vector<int> padding_data{0, 0, 2, 1, 1, 2, 0, 0}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); + Tensor padding_tensor = + makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get()); + + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::REFLECT); + + std::vector<float> ref_output_data{2.0f, 1.0f, 2.0f, 1.0f, 2.0f, // + 4.0f, 3.0f, 4.0f, 3.0f, 4.0f, // + 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, // + 4.0f, 3.0f, 4.0f, 3.0f, 4.0f, // + 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}; // + std::initializer_list<int32_t> ref_output_shape{1, 5, 5, 1}; + + EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + +TEST_F(MirrorPadTest, FloatSymmetric) +{ + Shape input_shape = {1, 2, 2, 1}; + Shape padding_shape = {4, 2}; + + std::vector<float> input_data{1.0f, 2.0f, // + 3.0f, 4.0f}; // 
+ std::vector<int> padding_data{0, 0, 2, 1, 1, 2, 0, 0}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); + Tensor padding_tensor = + makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get()); + + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::SYMMETRIC); + + std::vector<float> ref_output_data{3.0, 3.0, 4.0, 4.0, 3.0, // + 1.0, 1.0, 2.0, 2.0, 1.0, // + 1.0, 1.0, 2.0, 2.0, 1.0, // + 3.0, 3.0, 4.0, 4.0, 3.0, // + 3.0, 3.0, 4.0, 4.0, 3.0}; // + std::initializer_list<int32_t> ref_output_shape{1, 5, 5, 1}; + + EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + +TEST_F(MirrorPadTest, FloatSymmetric2Dim) +{ + Shape input_shape = {3, 1}; + Shape padding_shape = {2, 2}; + + std::vector<float> input_data{1.0f, 2.0f, 3.0f}; + std::vector<int> padding_data{1, 2, 0, 0}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); + Tensor padding_tensor = + makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get()); + + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::SYMMETRIC); + + std::vector<float> ref_output_data{1.0, 1.0, 2.0, 3.0, 3.0, 2.0}; + std::initializer_list<int32_t> ref_output_shape{6, 1}; + + EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + +TEST_F(MirrorPadTest, Uint8Reflect) +{ + Shape input_shape = {1, 2, 3, 1}; + Shape padding_shape = {4, 2}; + + float quant_tolerance = getTolerance(0.0f, 6.0f, 255); + std::pair<float, int32_t> quant_param = 
quantizationParams<uint8_t>(0.0f, 6.0f); + + std::vector<float> input_data{1.0f, 2.0f, 3.0f, // + 4.0f, 5.0f, 6.0f}; // + std::vector<int> padding_data{0, 0, 2, 1, 1, 3, 0, 0}; + + Tensor input_tensor = makeInputTensor<DataType::U8>( + input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get()); + + Tensor padding_tensor = + makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get()); + + Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second); + + Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::REFLECT); + + std::vector<float> ref_output_data{ + 3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, // + 6.0f, 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, // + 3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, // + 6.0f, 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, // + 3.0f, 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, // + }; + std::initializer_list<int32_t> ref_output_shape{1, 5, 7, 1}; + + EXPECT_THAT(dequantizeTensorData(output_tensor), + FloatArrayNear(ref_output_data, quant_tolerance)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + +TEST_F(MirrorPadTest, Uint8Symmetric) +{ + Shape input_shape = {1, 2, 3, 1}; + Shape padding_shape = {4, 2}; + + float quant_tolerance = getTolerance(0.0f, 6.0f, 255); + std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(0.0f, 6.0f); + + std::vector<float> input_data{1.0f, 2.0f, 3.0f, // + 4.0f, 5.0f, 6.0f}; // + std::vector<int> padding_data{0, 0, 2, 1, 1, 3, 0, 0}; + + Tensor input_tensor = makeInputTensor<DataType::U8>( + input_shape, quant_param.first, quant_param.second, input_data, _memory_manager.get()); + + Tensor padding_tensor = + makeInputTensor<DataType::S32>(padding_shape, padding_data, _memory_manager.get()); + + Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second); + + Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::SYMMETRIC); 
+ + std::vector<float> ref_output_data{ + 4.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, // + 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 2.0f, 1.0f, // + 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 2.0f, 1.0f, // + 4.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, // + 4.0f, 4.0f, 5.0f, 6.0f, 6.0f, 5.0f, 4.0f, // + }; + std::initializer_list<int32_t> ref_output_shape{1, 5, 7, 1}; + + EXPECT_THAT(dequantizeTensorData(output_tensor), + FloatArrayNear(ref_output_data, quant_tolerance)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + +TEST_F(MirrorPadTest, UnsupportedDim_NEG) +{ + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>({1, 1, 1, 1, 1}, {1.0f}, _memory_manager.get()); + Tensor padding_tensor = + makeInputTensor<DataType::S32>({5, 2}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + EXPECT_ANY_THROW(Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::REFLECT)); +} + +TEST_F(MirrorPadTest, InvalidInputType_NEG) +{ + Tensor input_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get()); + Tensor padding_tensor = makeInputTensor<DataType::S32>({1, 2}, {0, 0}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S64); + + EXPECT_ANY_THROW(Execute(input_tensor, padding_tensor, output_tensor, MirrorPadMode::REFLECT)); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Mul.cpp b/compiler/luci-interpreter/src/kernels/Mul.cpp index bc855de0f..531fb4fa1 100644 --- a/compiler/luci-interpreter/src/kernels/Mul.cpp +++ b/compiler/luci-interpreter/src/kernels/Mul.cpp @@ -42,6 +42,8 @@ void Mul::configure() LUCI_INTERPRETER_CHECK(output()->element_type() == input1()->element_type()); if (input1()->element_type() == DataType::S16) { + LUCI_INTERPRETER_CHECK(input1()->zero_points().size() == 1 && + input2()->zero_points().size() == 1) 
LUCI_INTERPRETER_CHECK(input1()->zero_point() == 0 && input2()->zero_point() == 0 && output()->zero_point() == 0); } @@ -56,6 +58,12 @@ void Mul::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::S16: evalQuantizedS16(); break; @@ -66,13 +74,8 @@ void Mul::execute() const void Mul::evalFloat() const { - float activation_min{}; - float activation_max{}; - calculateActivationRange(_params.activation, &activation_min, &activation_max); - tflite::ArithmeticParams params{}; - params.float_activation_min = activation_min; - params.float_activation_max = activation_max; + fillArithmeticActivationRange<float>(params, _params.activation); const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( getTensorShape(input1()), getTensorShape(input2()), ¶ms); @@ -91,6 +94,28 @@ void Mul::evalFloat() const } } +template <typename T> void Mul::evalInteger() const +{ + tflite::ArithmeticParams params{}; + fillArithmeticActivationRange<T>(params, _params.activation); + + const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( + getTensorShape(input1()), getTensorShape(input2()), ¶ms); + + if (need_broadcast) + { + luci_interpreter_pal::BroadcastMul4DSlow( + params, getTensorShape(input1()), getTensorData<T>(input1()), getTensorShape(input2()), + getTensorData<T>(input2()), getTensorShape(output()), getTensorData<T>(output())); + } + else + { + luci_interpreter_pal::Mul(params, getTensorShape(input1()), getTensorData<T>(input1()), + getTensorShape(input2()), getTensorData<T>(input2()), + getTensorShape(output()), getTensorData<T>(output())); + } +} + void Mul::evalQuantizedS16() const { const auto input1_scale = static_cast<double>(input1()->scale()); diff --git a/compiler/luci-interpreter/src/kernels/Mul.h b/compiler/luci-interpreter/src/kernels/Mul.h index 2ccf60f3a..c0cf817df 100644 --- 
a/compiler/luci-interpreter/src/kernels/Mul.h +++ b/compiler/luci-interpreter/src/kernels/Mul.h @@ -42,6 +42,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantizedS16() const; }; diff --git a/compiler/luci-interpreter/src/kernels/Mul.test.cpp b/compiler/luci-interpreter/src/kernels/Mul.test.cpp index 471f6ac86..fc0e60614 100644 --- a/compiler/luci-interpreter/src/kernels/Mul.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Mul.test.cpp @@ -93,6 +93,78 @@ TEST_F(MulTest, Float) } } +template <loco::DataType DType> void checkInteger(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + Shape base_shape = {2, 3, 1, 2}; + std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}}; + + dtype max_value = std::numeric_limits<dtype>::max(); + dtype res_max = max_value - max_value % 10; + + std::vector<std::vector<dtype>> test_outputs = { + {8, 0, 20, 0, 4, 30, // + 16, 0, 40, 3, 8, 0, // + 0, 0, 0, 6, 0, 0, // + 4, 0, 10, 9, 2, 0, // + 40, 0, 100, 0, 20, 150, // + 28, 0, 70, 0, 14, res_max}, + {8, 0, 40, 3, 0, 0, 4, 0, 100, 0, 14, res_max}, + {8, 12, 0, 0, 20, 30, 16, 0, 0, 0, 40, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 2, 0, 10, 0, 0, 0, 20, 30, 100, 150, 0, 0, 14, max_value / 10 * 2, + 70, res_max}, + {8, 12, 0, 0, 0, 0, 0, 9, 20, 30, 70, res_max}}; + std::vector<dtype> input1_data{2, 3, 4, -1, -3, -2, 1, -3, 10, 15, 7, max_value / 10}; + std::vector<dtype> input2_data{4, 0, 10, -3, 2, 10}; + for (size_t i = 0; i < test_shapes.size(); ++i) + { + Tensor input1_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager); + Tensor input2_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DType); + + MulParams params{}; + params.activation = Activation::RELU; + + Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + kernel.configure(); + 
memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i]) + << "With shape number " << i; + } + // Re-run with exchanged inputs. + for (size_t i = 0; i < test_shapes.size(); ++i) + { + Tensor input1_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager); + Tensor input2_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DType); + + MulParams params{}; + params.activation = Activation::RELU; + + Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i]) + << "With shape number " << i; + } +} + +TEST_F(MulTest, SInt64) +{ + checkInteger<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(MulTest, SInt32) +{ + checkInteger<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + TEST_F(MulTest, SInt16) { Shape base_shape = {2, 3, 1, 2}; @@ -161,6 +233,60 @@ TEST_F(MulTest, SInt16) } } +TEST_F(MulTest, Input_Output_Type_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + MulParams params{}; + params.activation = Activation::RELU; + + Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(MulTest, Invalid_Output_Type_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + MulParams params{}; + params.activation = Activation::RELU; + + Mul 
kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(MulTest, Invalid_Input_Type_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::U64>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::U64>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::U64); + + MulParams params{}; + params.activation = Activation::RELU; + + Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + EXPECT_ANY_THROW(kernel.execute()); +} + +TEST_F(MulTest, Invalid_Quantization_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::S16>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S16>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S16); + + MulParams params{}; + params.activation = Activation::NONE; + + Mul kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.cpp b/compiler/luci-interpreter/src/kernels/NotEqual.cpp index 99d5e0fa0..54e5eee34 100644 --- a/compiler/luci-interpreter/src/kernels/NotEqual.cpp +++ b/compiler/luci-interpreter/src/kernels/NotEqual.cpp @@ -49,6 +49,12 @@ void NotEqual::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -79,6 +85,29 @@ void NotEqual::evalFloat() const } } +template <typename T> void NotEqual::evalInteger() const +{ + const auto x_data = getTensorData<T>(x()); + const auto y_data = getTensorData<T>(y()); + auto output_data = getTensorData<bool>(output()); + + tflite::ComparisonParams op_params; + 
op_params.is_broadcast = x()->shape() != y()->shape(); + + if (op_params.is_broadcast) + { + tflite::reference_ops::Broadcast4DSlowNotEqualNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, + getTensorShape(output()), output_data); + } + else + { + tflite::reference_ops::NotEqualNoScaling(op_params, getTensorShape(x()), x_data, + getTensorShape(y()), y_data, getTensorShape(output()), + output_data); + } +} + void NotEqual::evalQuantized() const { const auto x_data = getTensorData<uint8_t>(x()); diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.h b/compiler/luci-interpreter/src/kernels/NotEqual.h index 247874df7..d2aafe893 100644 --- a/compiler/luci-interpreter/src/kernels/NotEqual.h +++ b/compiler/luci-interpreter/src/kernels/NotEqual.h @@ -38,6 +38,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; private: diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp index 763f86893..45bf4022a 100644 --- a/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp +++ b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp @@ -99,6 +99,82 @@ TEST_F(NotEqualTest, FloatBroardcast) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); } +template <loco::DataType DType> +void checkIntegerSimple(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{min_value, 2, max_value}; + + std::vector<dtype> y_data{min_value, -2, max_value}; + + std::vector<bool> ref_output_data{false, true, false}; + + Tensor x_tensor = makeInputTensor<DType>({3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = 
makeOutputTensor(DataType::BOOL); + + NotEqual kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({3})); +} + +template <loco::DataType DType> +void checkIntegerBroadcast(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + dtype min_value = std::numeric_limits<dtype>::min(); + dtype max_value = std::numeric_limits<dtype>::max(); + std::vector<dtype> x_data{ + min_value, 2, 3, // Row 1 + 4, 5, max_value, // Row 2 + -1, -2, -3, // Row 3 + min_value, -2, max_value, // Row 4 + }; + + std::vector<dtype> y_data{ + min_value, -2, max_value, // Row 1 + }; + + std::vector<bool> ref_output_data{ + false, true, true, // Row 1 + true, true, false, // Row 2 + true, false, true, // Row 3 + false, false, false, // Row 4 + }; + + Tensor x_tensor = makeInputTensor<DType>({4, 3}, x_data, memory_manager); + Tensor y_tensor = makeInputTensor<DType>({3}, y_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + NotEqual kernel(&x_tensor, &y_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<bool>(output_tensor), ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({4, 3})); +} + +TEST_F(NotEqualTest, Int32) +{ + checkIntegerSimple<loco::DataType::S32>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S32>(_memory_manager.get()); + SUCCEED(); +} + +TEST_F(NotEqualTest, Int64) +{ + checkIntegerSimple<loco::DataType::S64>(_memory_manager.get()); + checkIntegerBroadcast<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + // Choose min / max in 
such a way that there are exactly 256 units to avoid rounding errors. const float F_MIN = -128.0 / 128.0; const float F_MAX = 127.0 / 128.0; @@ -195,6 +271,36 @@ TEST_F(NotEqualTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } +TEST_F(NotEqualTest, Float_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2}, {1.f, 2.f}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::FLOAT32>({3}, {1.f, 2.f, 3.f}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + NotEqual kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(NotEqualTest, Int32_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S32>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S32>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + NotEqual kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + +TEST_F(NotEqualTest, Int64_Broadcast_NEG) +{ + Tensor x_tensor = makeInputTensor<DataType::S64>({2}, {1, 2}, _memory_manager.get()); + Tensor y_tensor = makeInputTensor<DataType::S64>({3}, {1, 2, 3}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::BOOL); + + NotEqual kernel(&x_tensor, &y_tensor, &output_tensor); + ASSERT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/OneHot.cpp b/compiler/luci-interpreter/src/kernels/OneHot.cpp new file mode 100644 index 000000000..4d3e5f2ef --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/OneHot.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/OneHot.h" +#include "kernels/Utils.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +namespace +{ + +template <typename T> +void OneHotComputeImpl(const Tensor *indices_tensor, const Tensor *on_value_tensor, + const Tensor *off_value_tensor, int32_t depth, int32_t axis, + Tensor *output_tensor) +{ + // define input shape and correct axis + auto const &input_shape = indices_tensor->shape(); + axis = axis == -1 ? input_shape.num_dims() : axis; + + // TODO support other integer input types + auto const *indices = getTensorData<int32_t>(indices_tensor); + auto const on_value = getTensorData<T>(on_value_tensor)[0]; + auto const off_value = getTensorData<T>(off_value_tensor)[0]; + auto *output = getTensorData<T>(output_tensor); + + // prefix_dim_size == # of elements before the axis + // depth == # of elements per axis + // suffix_dim_size == # of elements after the axis + auto prefix_dim_size = 1; + for (int32_t i = 0; i < axis; ++i) + { + prefix_dim_size *= input_shape.dim(i); + } + assert(prefix_dim_size > 0); + auto const suffix_dim_size = input_shape.num_elements() / prefix_dim_size; + + // View the indices as a matrix of size: + // prefix_dim_size x suffix_dim_size + // View the output as a matrix of size: + // prefix_dim_size x depth x suffix_dim_size + // Then the output is: + // output(i, j, k) == (indices(i, k) == j) ? 
on : off
+ for (int32_t i = 0; i < prefix_dim_size; ++i)
+ for (int32_t j = 0; j < depth; ++j)
+ for (int32_t k = 0; k < suffix_dim_size; ++k, ++output)
+ *output = indices[i * suffix_dim_size + k] == j ? on_value : off_value;
+}
+
+} // namespace
+
+OneHot::OneHot(const Tensor *indices, const Tensor *depth, const Tensor *on_value,
+ const Tensor *off_value, Tensor *output, const OneHotParams &params)
+ : KernelWithParams<OneHotParams>({indices, depth, on_value, off_value}, {output}, params)
+{
+ // Do nothing
+}
+
+void OneHot::configure()
+{
+ // check types
+ LUCI_INTERPRETER_CHECK(indices()->element_type() == DataType::S32);
+ LUCI_INTERPRETER_CHECK(depth()->element_type() == DataType::S32);
+ LUCI_INTERPRETER_CHECK(on_value()->element_type() == off_value()->element_type());
+ LUCI_INTERPRETER_CHECK(output()->element_type() == on_value()->element_type());
+
+ // check shape dependent parameters
+ LUCI_INTERPRETER_CHECK(on_value()->shape().num_elements() == 1);
+ LUCI_INTERPRETER_CHECK(off_value()->shape().num_elements() == 1);
+ LUCI_INTERPRETER_CHECK(depth()->shape().num_elements() == 1);
+ LUCI_INTERPRETER_CHECK(params().axis >= -1 && params().axis <= indices()->shape().num_dims());
+
+ // define parameters that affect the output shape
+ auto const depth_value = getTensorData<int32_t>(depth())[0];
+ auto const &input_shape = indices()->shape();
+ auto const input_dims = input_shape.num_dims();
+ auto const axis = params().axis == -1 ?
input_dims : params().axis; + + // define output shape + Shape output_shape(input_shape.num_dims() + 1); + { + for (int32_t d = 0; d < axis; ++d) + output_shape.dim(d) = input_shape.dim(d); + + output_shape.dim(axis) = depth_value; + + for (int32_t d = axis + 1; d < output_shape.num_dims(); ++d) + output_shape.dim(d) = input_shape.dim(d - 1); + } + + // reshape output + output()->resize(output_shape); +} + +void OneHot::execute() const +{ + auto const depth_value = getTensorData<int32_t>(depth())[0]; + auto const axis = params().axis; + + switch (output()->element_type()) + { + case loco::DataType::FLOAT32: + OneHotComputeImpl<float>(indices(), on_value(), off_value(), depth_value, axis, output()); + break; + case loco::DataType::U8: + OneHotComputeImpl<uint8_t>(indices(), on_value(), off_value(), depth_value, axis, output()); + break; + case loco::DataType::S16: + OneHotComputeImpl<int16_t>(indices(), on_value(), off_value(), depth_value, axis, output()); + break; + default: + // TODO Support other data types + throw std::runtime_error("Not supported, yet!"); + break; + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/OneHot.h b/compiler/luci-interpreter/src/kernels/OneHot.h new file mode 100644 index 000000000..572f857ae --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/OneHot.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_ONEHOT_H
+#define LUCI_INTERPRETER_KERNELS_ONEHOT_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class OneHot : public KernelWithParams<OneHotParams>
+{
+public:
+ OneHot(const Tensor *indices, const Tensor *depth, const Tensor *on_value,
+ const Tensor *off_value, Tensor *output, const OneHotParams &params);
+
+ const Tensor *indices() const { return _inputs[0]; }
+ const Tensor *depth() const { return _inputs[1]; }
+ const Tensor *on_value() const { return _inputs[2]; }
+ const Tensor *off_value() const { return _inputs[3]; }
+
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_ONEHOT_H
diff --git a/compiler/luci-interpreter/src/kernels/OneHot.test.cpp b/compiler/luci-interpreter/src/kernels/OneHot.test.cpp
new file mode 100644
index 000000000..45b6968fa
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/OneHot.test.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "kernels/OneHot.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +template <typename T1, typename T2> +void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape, + std::initializer_list<T1> input_data, std::initializer_list<int32_t> depth_data, + std::initializer_list<T2> on_value_data, std::initializer_list<T2> off_value_data, + int32_t axis, std::initializer_list<T2> output_data) +{ + std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>(); + + constexpr auto input_type = getElementType<T1>(); + constexpr auto output_type = getElementType<T2>(); + + Tensor input_tensor = makeInputTensor<input_type>(input_shape, input_data, memory_manager.get()); + Tensor depth_tensor = makeInputTensor<DataType::S32>({}, depth_data, memory_manager.get()); + Tensor on_value_tensor = makeInputTensor<output_type>({}, on_value_data, memory_manager.get()); + Tensor off_value_tensor = makeInputTensor<output_type>({}, off_value_data, memory_manager.get()); + Tensor output_tensor = makeOutputTensor(output_type); + + OneHotParams params{}; + params.axis = axis; + + OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor, + params); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorShape(output_tensor), output_shape); + EXPECT_THAT(extractTensorData<T2>(output_tensor), ::testing::ElementsAreArray(output_data)); +} + +template <typename T> class OneHotTest : public ::testing::Test +{ +}; + +using DataTypes = ::testing::Types<float, uint8_t, int16_t>; +TYPED_TEST_SUITE(OneHotTest, DataTypes); + +TYPED_TEST(OneHotTest, BasicPattern) +{ + // axis 0 + Check<int32_t, TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{4, 2, 3}, + /*input_data=*/ + { + 0, 3, 5, // + 7, 
3, 0, // + }, + /*depth_data=*/{4}, /*on_value_data=*/{1}, /*off_value_data=*/{0}, + /*axis=*/0, + /*output_data=*/ + { + 1, 0, 0, // + 0, 0, 1, // + + 0, 0, 0, // + 0, 0, 0, // + + 0, 0, 0, // + 0, 0, 0, // + + 0, 1, 0, // + 0, 1, 0, // + }); + // axis 1 + Check<int32_t, TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{2, 4, 3}, + /*input_data=*/ + { + 0, 3, 5, // + 7, 3, 0, // + }, + /*depth_data=*/{4}, /*on_value_data=*/{1}, /*off_value_data=*/{0}, + /*axis=*/1, + /*output_data=*/ + { + 1, 0, 0, // + 0, 0, 0, // + 0, 0, 0, // + 0, 1, 0, // + + 0, 0, 1, // + 0, 0, 0, // + 0, 0, 0, // + 0, 1, 0, // + }); + // axis -1 + Check<int32_t, TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{2, 3, 4}, + /*input_data=*/ + { + 0, 3, 5, // + 7, 3, 0, // + }, + /*depth_data=*/{4}, /*on_value_data=*/{1}, /*off_value_data=*/{0}, + /*axis=*/-1, + /*output_data=*/ + { + 1, 0, 0, 0, // + 0, 0, 0, 1, // + 0, 0, 0, 0, // + + 0, 0, 0, 0, // + 0, 0, 0, 1, // + 1, 0, 0, 0, // + }); +} + +TEST(OneHotTest, UnsupportedInputType_NEG) +{ + std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>(); + + // input type should be integer + Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1}, {0}, memory_manager.get()); + + Tensor depth_tensor = makeInputTensor<DataType::S32>({}, {1}, memory_manager.get()); + Tensor on_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {1.0}, memory_manager.get()); + Tensor off_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {0.0}, memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + OneHotParams params = {-1}; + + OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor, + params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST(OneHotTest, OutputTypeMismatch_NEG) +{ + std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>(); + + Tensor input_tensor = makeInputTensor<DataType::S32>({1}, {0}, 
memory_manager.get()); + Tensor depth_tensor = makeInputTensor<DataType::S32>({}, {1}, memory_manager.get()); + + // type of on_value, off_value and output_tensor should be same + Tensor on_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {1.0}, memory_manager.get()); + Tensor off_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {0.0}, memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S16); + + OneHotParams params = {-1}; + + OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor, + params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST(OneHotTest, InvalidAxis_NEG) +{ + std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>(); + + Tensor input_tensor = makeInputTensor<DataType::S32>({1}, {0}, memory_manager.get()); + Tensor depth_tensor = makeInputTensor<DataType::S32>({}, {1}, memory_manager.get()); + Tensor on_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {1.0}, memory_manager.get()); + Tensor off_value_tensor = makeInputTensor<DataType::FLOAT32>({}, {0.0}, memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + // axis should be in [-1, input_shape.rank] + OneHotParams params = {-2}; + + OneHot kernel(&input_tensor, &depth_tensor, &on_value_tensor, &off_value_tensor, &output_tensor, + params); + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Pack.test.cpp b/compiler/luci-interpreter/src/kernels/Pack.test.cpp index 90a0f894e..2404e4303 100644 --- a/compiler/luci-interpreter/src/kernels/Pack.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Pack.test.cpp @@ -80,7 +80,7 @@ template <typename T> class PackTest : public ::testing::Test }; using DataTypes = ::testing::Types<uint8_t, float>; -TYPED_TEST_CASE(PackTest, DataTypes); +TYPED_TEST_SUITE(PackTest, DataTypes); TYPED_TEST(PackTest, 
ThreeInputs) { diff --git a/compiler/luci-interpreter/src/kernels/Pad.cpp b/compiler/luci-interpreter/src/kernels/Pad.cpp index 700448e7a..fe172884b 100644 --- a/compiler/luci-interpreter/src/kernels/Pad.cpp +++ b/compiler/luci-interpreter/src/kernels/Pad.cpp @@ -93,6 +93,16 @@ void Pad::execute() const getTensorData<uint8_t>(output())); break; } + case DataType::S8: + { + assert(output()->zero_point() >= std::numeric_limits<int8_t>::min()); + assert(output()->zero_point() <= std::numeric_limits<int8_t>::max()); + const auto pad_value = static_cast<int8_t>(output()->zero_point()); + tflite::reference_ops::Pad(params, getTensorShape(input()), getTensorData<int8_t>(input()), + &pad_value, getTensorShape(output()), + getTensorData<int8_t>(output())); + break; + } default: throw std::runtime_error("Unsupported type."); } diff --git a/compiler/luci-interpreter/src/kernels/Pad.test.cpp b/compiler/luci-interpreter/src/kernels/Pad.test.cpp index 7994263e2..dd3ce947c 100644 --- a/compiler/luci-interpreter/src/kernels/Pad.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Pad.test.cpp @@ -54,6 +54,32 @@ TEST(Pad, Uint8) EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 4, 7, 1})); } +TEST(Pad, Int8) +{ + std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>(); + float kQuantizedTolerance = GetTolerance(-1.0, 1.0); + std::pair<float, int32_t> quant_param = quantizationParams<int8_t>(-1.0f, 1.0f); + std::vector<float> input_data{-0.2, 0.4, 0.5, -0.7, -0.1, -0.9, 0.7, 0.1, 0.2}; + std::vector<int32_t> paddings_data{0, 0, 1, 2, 2, 1, 0, 0}; + Tensor input_tensor = makeInputTensor<DataType::S8>( + {1, 3, 3, 1}, quant_param.first, quant_param.second, input_data, memory_manager.get()); + Tensor paddings_tensor = + makeInputTensor<DataType::S32>({4, 2}, paddings_data, memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S8, quant_param.first, quant_param.second); + + Pad kernel(&input_tensor, 
&paddings_tensor, &output_tensor); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + std::vector<float> ref_output_data{0, 0, 0, 0, 0, 0, 0, 0, -0.2, 0.4, 0.5, 0, + 0, 0, -0.7, -0.1, -0.9, 0, 0, 0, 0.7, 0.1, 0.2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + EXPECT_THAT(dequantizeTensorData(output_tensor), + FloatArrayNear(ref_output_data, kQuantizedTolerance)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 6, 6, 1})); +} + TEST(Pad, Float) { std::unique_ptr<IMemoryManager> memory_manager = std::make_unique<TestMemoryManager>(); diff --git a/compiler/luci-interpreter/src/kernels/Quantize.cpp b/compiler/luci-interpreter/src/kernels/Quantize.cpp new file mode 100644 index 000000000..0c8544a65 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Quantize.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/Quantize.h" +#include "kernels/Utils.h" +#include "PALQuantize.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +namespace +{ + +template <typename input_dtype> void call_requantize(const Tensor *input, Tensor *output) +{ + int32_t multiplier; + int shift; + + const double effective_output_scale = input->scale() / output->scale(); + quantizeMultiplier(effective_output_scale, &multiplier, &shift); + + const auto input_shape = getTensorShape(input); + const auto output_shape = getTensorShape(output); + const auto size = tflite::MatchingFlatSize(input_shape, output_shape); + + const auto input_data = getTensorData<input_dtype>(input); + + switch (output->element_type()) + { + case loco::DataType::S8: + luci_interpreter_pal::Requantize(input_data, size, multiplier, shift, input->zero_point(), + output->zero_point(), getTensorData<int8_t>(output)); + break; + case loco::DataType::U8: + luci_interpreter_pal::Requantize(input_data, size, multiplier, shift, input->zero_point(), + output->zero_point(), getTensorData<uint8_t>(output)); + break; + case loco::DataType::S16: + luci_interpreter_pal::Requantize(input_data, size, multiplier, shift, input->zero_point(), + output->zero_point(), getTensorData<int16_t>(output)); + break; + default: + throw std::runtime_error("Unsupported quantized type, yet!"); + } +} + +} // namespace + +Quantize::Quantize(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {} + +void Quantize::configure() +{ + + if (input()->element_type() == loco::DataType::S16) + LUCI_INTERPRETER_CHECK(input()->zero_point() == 0); + + switch (input()->element_type()) + { + case loco::DataType::FLOAT32: + { + LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::U8 || + output()->element_type() == loco::DataType::S8 || + output()->element_type() == loco::DataType::S16); + break; + } + case loco::DataType::S16: + case loco::DataType::S8: + case loco::DataType::U8: + { + 
LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::S8 || + output()->element_type() == loco::DataType::U8 || + output()->element_type() == loco::DataType::S16); + if (output()->element_type() == loco::DataType::S16) + { + LUCI_INTERPRETER_CHECK(output()->zero_point() == 0); + } + break; + } + default: + throw std::runtime_error("Unsupported type"); + } + + output()->resize(input()->shape()); +} + +void Quantize::execute() const +{ + switch (input()->element_type()) + { + case loco::DataType::FLOAT32: + { + tflite::QuantizationParams op_params; + op_params.zero_point = output()->zero_point(); + op_params.scale = output()->scale(); + const auto input_data = getTensorData<float>(input()); + + switch (output()->element_type()) + { + case loco::DataType::S8: + { + luci_interpreter_pal::Quantize(op_params, getTensorShape(input()), input_data, + getTensorShape(output()), getTensorData<int8_t>(output())); + break; + } + case loco::DataType::U8: + { + luci_interpreter_pal::Quantize(op_params, getTensorShape(input()), input_data, + getTensorShape(output()), + getTensorData<uint8_t>(output())); + break; + } + case loco::DataType::S16: + { + luci_interpreter_pal::Quantize(op_params, getTensorShape(input()), input_data, + getTensorShape(output()), + getTensorData<int16_t>(output())); + break; + } + default: + throw std::runtime_error("Unsupported type."); + } + break; + } + case loco::DataType::S16: + { + call_requantize<int16_t>(input(), output()); + break; + } + case loco::DataType::S8: + { + call_requantize<int8_t>(input(), output()); + break; + } + case loco::DataType::U8: + { + call_requantize<uint8_t>(input(), output()); + break; + } + default: + throw std::runtime_error("Unsupported type."); + } +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Quantize.h b/compiler/luci-interpreter/src/kernels/Quantize.h new file mode 100644 index 000000000..006c5366f --- /dev/null +++ 
b/compiler/luci-interpreter/src/kernels/Quantize.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_QUANTIZE_H +#define LUCI_INTERPRETER_KERNELS_QUANTIZE_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class Quantize : public Kernel +{ +public: + Quantize(const Tensor *input, Tensor *output); + + const Tensor *input() const { return _inputs[0]; } + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_QUANTIZE_H diff --git a/compiler/luci-interpreter/src/kernels/Quantize.test.cpp b/compiler/luci-interpreter/src/kernels/Quantize.test.cpp new file mode 100644 index 000000000..22e67fe3f --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/Quantize.test.cpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/Quantize.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class QuantizeTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } + + std::unique_ptr<IMemoryManager> _memory_manager; +}; + +TEST_F(QuantizeTest, FloatUint8) +{ + std::vector<float> input_data{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64}; + + std::vector<uint8_t> ref_output_data{0, 1, 2, 3, 4, 251, 252, 253, 254, 255}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>({2, 5}, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::U8, /*scale*/ 0.5, /*zero_point*/ 127); + + Quantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<uint8_t>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 5})); +} + +TEST_F(QuantizeTest, FloatInt8) +{ + std::vector<float> input_data{-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64}; + + std::vector<int8_t> ref_output_data{-128, -127, -126, -125, -124, 123, 124, 125, 126, 127}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>({2, 5}, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S8, /*scale*/ 0.5, 
/*zero_point*/ -1); + + Quantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<int8_t>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 5})); +} + +TEST_F(QuantizeTest, FloatInt16) +{ + std::vector<float> input_data{-63.5, -63, -3, -2, -1, 1, 2, 3, 63.5, 64}; + + std::vector<int16_t> ref_output_data{-12700, -12600, -600, -400, -200, + 200, 400, 600, 12700, 12800}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>({2, 5}, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S16, /*scale*/ 0.005, /*zero_point*/ 0); + + Quantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<int16_t>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({2, 5})); +} + +TEST_F(QuantizeTest, Int16Int16) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + std::vector<int16_t> ref_output_data{2, 4, 6, 8, 10, 12, 14, 16, 18, 20}; + + Tensor input_tensor = makeInputTensor<DataType::S16>( + {1, 1, 2, 5}, /*scale*/ 1.0, /*zero_point*/ 0, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S16, /*scale*/ 0.5, /*zero_point*/ 0); + + Quantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<int16_t>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 5})); +} + +TEST_F(QuantizeTest, Int8Int8) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + 
std::vector<int8_t> ref_output_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; + + Tensor input_tensor = makeInputTensor<DataType::S8>( + {1, 1, 2, 5}, /*scale*/ 0.5, /*zero_point*/ -1, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1); + + Quantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<int8_t>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 5})); +} + +TEST_F(QuantizeTest, Uint8Uint8) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + std::vector<uint8_t> ref_output_data{129, 131, 133, 135, 137, 139, 141, 143, 145, 147}; + + Tensor input_tensor = makeInputTensor<DataType::U8>( + {1, 1, 2, 5}, /*scale*/ 0.5, /*zero_point*/ 127, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::U8, /*scale*/ 0.5, /*zero_point*/ 127); + + Quantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<uint8_t>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 5})); +} + +TEST_F(QuantizeTest, Int16Int8) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + std::vector<int8_t> ref_output_data{1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; + + Tensor input_tensor = makeInputTensor<DataType::S16>( + {1, 1, 2, 5}, /*scale*/ 1.0, /*zero_point*/ 0, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1); + + Quantize kernel(&input_tensor, &output_tensor); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + 
EXPECT_THAT(extractTensorData<int8_t>(output_tensor), + ::testing::ElementsAreArray(ref_output_data)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 2, 5})); +} + +TEST_F(QuantizeTest, InvalidInputType_NEG) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + Tensor input_tensor = + makeInputTensor<DataType::S32>({1, 1, 2, 5}, 0.5, 0, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S8, /*scale*/ 0.5, /*zero_point*/ -1); + + Quantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(QuantizeTest, InvalidOutputTypeForFloatInput_NEG) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>({1, 1, 2, 5}, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Quantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(QuantizeTest, InvalidOutputTypeForInt16Input_NEG) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + Tensor input_tensor = + makeInputTensor<DataType::S16>({1, 1, 2, 5}, 0.5, 0, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Quantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(QuantizeTest, InvalidOutputTypeForInt8Input_NEG) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + Tensor input_tensor = + makeInputTensor<DataType::S8>({1, 1, 2, 5}, 0.5, 0, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Quantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(QuantizeTest, InvalidOutputTypeForUint8Input_NEG) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + Tensor input_tensor = + 
makeInputTensor<DataType::U8>({1, 1, 2, 5}, 0.5, 0, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + Quantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(QuantizeTest, InvalidInputZeroPoint_NEG) +{ + std::vector<float> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + Tensor input_tensor = + makeInputTensor<DataType::S16>({1, 1, 2, 5}, 0.5, -1, input_data, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0); + + Quantize kernel(&input_tensor, &output_tensor); + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp index 7af20f8c4..933a1128c 100644 --- a/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp +++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp @@ -90,7 +90,7 @@ template <typename T> class ResizeBilinearTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(ResizeBilinearTest, DataTypes); +TYPED_TEST_SUITE(ResizeBilinearTest, DataTypes); TYPED_TEST(ResizeBilinearTest, SimpleTest) { diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp index 0e9017c78..7ade02a6f 100644 --- a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp +++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp @@ -92,7 +92,7 @@ template <typename T> class ResizeNearestNeighborTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(ResizeNearestNeighborTest, DataTypes); +TYPED_TEST_SUITE(ResizeNearestNeighborTest, DataTypes); TYPED_TEST(ResizeNearestNeighborTest, SimpleTest) { diff --git 
a/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp b/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp index 2bd94875b..c0025faca 100644 --- a/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp +++ b/compiler/luci-interpreter/src/kernels/ReverseV2.test.cpp @@ -33,7 +33,7 @@ template <typename T> class ReverseV2Test : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(ReverseV2Test, DataTypes); +TYPED_TEST_SUITE(ReverseV2Test, DataTypes); TYPED_TEST(ReverseV2Test, MultiDimensions) { diff --git a/compiler/luci-interpreter/src/kernels/SVDF.cpp b/compiler/luci-interpreter/src/kernels/SVDF.cpp new file mode 100644 index 000000000..40d79aaa3 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/SVDF.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernels/SVDF.h" +#include "kernels/Utils.h" +#include "PALSVDF.h" + +#include <tensorflow/lite/kernels/internal/quantization_util.h> + +namespace luci_interpreter +{ +namespace kernels +{ + +namespace +{ +TfLiteFusedActivation get_tflite_activation(Activation activation) +{ + switch (activation) + { + case luci::FusedActFunc::RELU: + return kTfLiteActRelu; + case luci::FusedActFunc::RELU6: + return kTfLiteActRelu6; + case luci::FusedActFunc::RELU_N1_TO_1: + return kTfLiteActReluN1To1; + case luci::FusedActFunc::TANH: + return kTfLiteActTanh; + case luci::FusedActFunc::SIGN_BIT: + return kTfLiteActSignBit; + case luci::FusedActFunc::NONE: + return kTfLiteActNone; + default: + throw std::runtime_error("Unsupported activation type"); + } +} +} // namespace + +SVDF::SVDF(const Tensor *input, const Tensor *weight_feature, const Tensor *weight_time, + const Tensor *bias, const Tensor *input_activation_state, Tensor *output, + Tensor *scratchpad_activation_state, Tensor *scratchpad_1, Tensor *scratchpad_2, + Tensor *scratchpad_3, Tensor *scratchpad_4, Tensor *scratchpad_5, Tensor *scratchpad_6, + const SVDFParams ¶ms) + : KernelWithParams<SVDFParams>({input, weight_feature, weight_time, bias, input_activation_state}, + {output, scratchpad_activation_state, scratchpad_1, scratchpad_2, + scratchpad_3, scratchpad_4, scratchpad_5, scratchpad_6}, + params) +{ + // Do nothing +} + +void SVDF::configure() +{ + const Shape &input_shape = input()->shape(); + const Shape &weight_features_shape = weight_feature()->shape(); + const Shape &weight_time_shape = weight_time()->shape(); + + // Validate Input Tensor: + LUCI_INTERPRETER_CHECK(input()->element_type() == loco::DataType::FLOAT32 || + input()->element_type() == loco::DataType::S8); + LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 2); + + // Validate inputs and output types + if (input()->element_type() == loco::DataType::S8) + { + LUCI_INTERPRETER_CHECK(weight_feature()->element_type() == 
loco::DataType::S8); + LUCI_INTERPRETER_CHECK(weight_time()->element_type() == loco::DataType::S16 || + weight_time()->element_type() == loco::DataType::S8); + if (bias()) + LUCI_INTERPRETER_CHECK(bias()->element_type() == loco::DataType::S32); + + LUCI_INTERPRETER_CHECK(input_activation_state()->element_type() == loco::DataType::S16 || + input_activation_state()->element_type() == loco::DataType::S8); + LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::S8); + + // Note: now tflite support only ReLU activation for integer SVDF + LUCI_INTERPRETER_CHECK(params().activation == luci::FusedActFunc::RELU); + } + else if (weight_feature()->element_type() == loco::DataType::FLOAT32) + { + LUCI_INTERPRETER_CHECK(weight_feature()->element_type() == loco::DataType::FLOAT32); + LUCI_INTERPRETER_CHECK(weight_time()->element_type() == loco::DataType::FLOAT32); + LUCI_INTERPRETER_CHECK(input_activation_state()->element_type() == loco::DataType::FLOAT32); + if (bias()) + LUCI_INTERPRETER_CHECK(bias()->element_type() == loco::DataType::FLOAT32); + LUCI_INTERPRETER_CHECK(output()->element_type() == loco::DataType::FLOAT32); + } + else if ((weight_feature()->element_type() == loco::DataType::U8 || + weight_feature()->element_type() == loco::DataType::S8) && + input()->element_type() == loco::DataType::FLOAT32) + { + // TODO:: support hybrid SVDF op + throw std::runtime_error("Hybrid type is not currently supported"); + } + else + { + throw std::runtime_error("Unsupported type."); + } + + // Check all the parameters of tensor match within themselves and match the + // input configuration. 
+ const int rank = params().svdf_rank; + const int batch_size = input_shape.dim(0); + const int num_filters = weight_features_shape.dim(0); + LUCI_INTERPRETER_CHECK(rank != 0); + LUCI_INTERPRETER_CHECK(num_filters % rank == 0); + + const int num_units = num_filters / rank; + const int memory_size = weight_time_shape.dim(1); + + // Validate Weight_Feature Input Tensor: + LUCI_INTERPRETER_CHECK(weight_features_shape.num_dims() == 2); + LUCI_INTERPRETER_CHECK(weight_features_shape.dim(1) == input_shape.dim(1)); + + // Validate Weight_Time Input Tensor: + LUCI_INTERPRETER_CHECK(weight_time_shape.num_dims() == 2); + LUCI_INTERPRETER_CHECK(weight_time_shape.dim(0) == num_filters); + + // Validate Bias + if (bias()) + LUCI_INTERPRETER_CHECK(bias()->shape().dim(0) == num_units); + + // Validate Input Activation State + LUCI_INTERPRETER_CHECK(input_activation_state()->shape().num_dims() == 2); + LUCI_INTERPRETER_CHECK(input_activation_state()->shape().dim(0) == batch_size); + LUCI_INTERPRETER_CHECK(input_activation_state()->shape().dim(1) == memory_size * num_filters); + + // Resize scratchpad_state to input_activation_state + auto scratchpad_activation_state = getOutputTensors()[1]; + scratchpad_activation_state->resize({batch_size, memory_size * num_filters}); + + // Resize output tensor + output()->resize({batch_size, num_units}); + + luci_interpreter_pal::SetupScratchpadTensor( + input()->element_type(), weight_feature()->element_type(), getOutputTensors()[2], + getOutputTensors()[3], getOutputTensors()[4], getOutputTensors()[5], getOutputTensors()[6], + getOutputTensors()[7], input_shape, weight_time_shape, batch_size, num_filters, num_units); +} + +void SVDF::execute() const +{ + switch (weight_feature()->element_type()) + { + case loco::DataType::FLOAT32: + evalFloat(); + break; + case loco::DataType::S8: + { + if (input()->element_type() == loco::DataType::S8) + evalInteger(); + else + // TODO:: support hybrid SVDF op + throw std::runtime_error("Hybrid type is not 
currently supported"); + break; + } + default: + throw std::runtime_error("Unsupported type"); + } +} + +void SVDF::evalInteger() const +{ + const auto effective_scale_1 = static_cast<double>(input()->scale() * weight_feature()->scale() / + input_activation_state()->scale()); + const auto effective_scale_2 = static_cast<double>(input_activation_state()->scale() * + weight_time()->scale() / output()->scale()); + + int32_t effective_scale_1_a; + int effective_scale_1_b; + int32_t effective_scale_2_a; + int effective_scale_2_b; + + tflite::QuantizeMultiplier(effective_scale_1, &effective_scale_1_a, &effective_scale_1_b); + tflite::QuantizeMultiplier(effective_scale_2, &effective_scale_2_a, &effective_scale_2_b); + + TfLiteSVDFParams params_svdf{}; + params_svdf.asymmetric_quantize_inputs = params().asymmetric_quantize_inputs; + params_svdf.rank = params().svdf_rank; + params_svdf.activation = get_tflite_activation(params().activation); + + auto scratchpad_activation_state = getOutputTensors()[1]; + // Note: it is expected that activation_state input variable tensor reset to zero, + // also expected that this variable tensor doesn't have buffer + auto scratchpad_data = getTensorData<int16_t>(scratchpad_activation_state); + std::fill_n(scratchpad_data, scratchpad_activation_state->shape().num_elements(), 0); + + auto scratchpad = getOutputTensors()[2]; + auto output_temp = getOutputTensors()[3]; + + int32_t input_zp = input()->zero_point(); + int32_t output_zp = output()->zero_point(); + luci_interpreter_pal::IntegerSVDF( + params_svdf, getTensorShape(input()), getTensorData<int8_t>(input()), + getTensorShape(weight_feature()), getTensorData<int8_t>(weight_feature()), + getTensorShape(weight_time()), getTensorData<int16_t>(weight_time()), getTensorShape(bias()), + getTensorData<int32_t>(bias()), scratchpad_data, getTensorShape(output()), + getTensorData<int8_t>(output()), getTensorData<int32_t>(scratchpad), + getTensorData<int32_t>(output_temp), effective_scale_1_a, 
effective_scale_1_b, + effective_scale_2_a, effective_scale_2_b, input_zp, output_zp); +} + +void SVDF::evalFloat() const +{ + TfLiteSVDFParams params_svdf{}; + params_svdf.asymmetric_quantize_inputs = params().asymmetric_quantize_inputs; + params_svdf.rank = params().svdf_rank; + params_svdf.activation = get_tflite_activation(params().activation); + + auto scratchpad_activation_state = getOutputTensors()[1]; + // Note: it is expected that activation_state input variable tensor reset to zero, + // also expected that this variable tensor doesn't have buffer + auto scratchpad_data = getTensorData<float>(scratchpad_activation_state); + std::fill_n(scratchpad_data, scratchpad_activation_state->shape().num_elements(), 0); + + auto scratchpad_1 = getOutputTensors()[2]; + + luci_interpreter_pal::FloatSVDF( + params_svdf, getTensorShape(input()), getTensorData<float>(input()), + getTensorShape(weight_feature()), getTensorData<float>(weight_feature()), + getTensorShape(weight_time()), getTensorData<float>(weight_time()), getTensorShape(bias()), + getTensorData<float>(bias()), getTensorData<float>(scratchpad_1), scratchpad_data, + getTensorShape(output()), getTensorData<float>(output())); +} + +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/SVDF.h b/compiler/luci-interpreter/src/kernels/SVDF.h new file mode 100644 index 000000000..335a6cd8f --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/SVDF.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_KERNELS_SVDF_H +#define LUCI_INTERPRETER_KERNELS_SVDF_H + +#include "core/Kernel.h" +#include "core/KernelParams.h" + +namespace luci_interpreter +{ +namespace kernels +{ + +class SVDF : public KernelWithParams<SVDFParams> +{ +public: + SVDF(const Tensor *input, const Tensor *weight_feature, const Tensor *weight_time, + const Tensor *bias, const Tensor *input_activation_state, Tensor *output, + Tensor *scratchpad_activation_state, Tensor *scratchpad_1, Tensor *scratchpad_2, + Tensor *scratchpad_3, Tensor *scratchpad_4, Tensor *scratchpad_5, Tensor *scratchpad_6, + const SVDFParams ¶ms); + + const Tensor *input() const { return _inputs[0]; } + const Tensor *weight_feature() const { return _inputs[1]; } + const Tensor *weight_time() const { return _inputs[2]; } + const Tensor *bias() const { return _inputs[3]; } + const Tensor *input_activation_state() const { return _inputs[4]; } + + Tensor *output() const { return _outputs[0]; } + + void configure() override; + void execute() const override; + +private: + void evalFloat() const; + void evalInteger() const; +}; + +} // namespace kernels +} // namespace luci_interpreter + +#endif // LUCI_INTERPRETER_KERNELS_SVDF_H diff --git a/compiler/luci-interpreter/src/kernels/SVDF.test.cpp b/compiler/luci-interpreter/src/kernels/SVDF.test.cpp new file mode 100644 index 000000000..82bd9b009 --- /dev/null +++ b/compiler/luci-interpreter/src/kernels/SVDF.test.cpp @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernels/SVDF.h" +#include "kernels/TestUtils.h" +#include "luci_interpreter/TestMemoryManager.h" + +namespace luci_interpreter +{ +namespace kernels +{ +namespace +{ + +using namespace testing; + +class SVDFTest : public ::testing::Test +{ +protected: + void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); } + + std::unique_ptr<IMemoryManager> _memory_manager; +}; + +TEST_F(SVDFTest, FullIntegerTest) +{ + const int32_t batches = 2; + const int32_t input_size = 3; + const int32_t units = 4; + const int32_t memory_size = 10; + const int32_t rank = 1; + const int32_t num_filters = units * rank; + + Shape input_shape{batches, input_size}; + Shape weight_feature_shape{num_filters, input_size}; + Shape weight_time_shape{num_filters, memory_size}; + Shape bias_shape{units}; + Shape activation_state_shape{batches, memory_size * num_filters}; + + std::vector<float> input_data{0.49837467, 0.19278903, 0.26584083, + 0.17660543, 0.52949083, -0.77931279}; + + std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667, 0.37613347, + 0.22197971, 0.12416199, 0.27901134, 0.27557442, + 0.3905206, -0.36137494, -0.06634006, -0.10640851}; + + std::vector<float> weight_time_data{ + -0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 
0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657}; + + std::vector<float> bias_data{-0.0976817, 0.15294972, 0.39635518, -0.02702999}; + + std::pair<float, int32_t> input_quant_param = quantizationParams<int8_t>(-1, 1); + std::pair<float, int32_t> weight_feature_quant_param = quantizationParams<int8_t>(-0.5, 0.5); + std::pair<float, int32_t> weight_time_quant_param = quantizationParams<int16_t>(-1, 1); + std::pair<float, int32_t> bias_quant_param = quantizationParams<int32_t>(-512, 512); + std::pair<float, int32_t> activation_state_quant_param = quantizationParams<int16_t>(-16, 16); + + std::pair<float, int32_t> output_quant_param = quantizationParams<int8_t>(-0.5, 0.5); + + Tensor input_tensor = + makeInputTensor<DataType::S8>(input_shape, input_quant_param.first, input_quant_param.second, + input_data, _memory_manager.get()); + Tensor weight_feature_tensor = makeInputTensor<DataType::S8>( + weight_feature_shape, weight_feature_quant_param.first, weight_feature_quant_param.second, + weight_feature_data, _memory_manager.get()); + Tensor weight_time_tensor = makeInputTensor<DataType::S16>( + weight_time_shape, weight_time_quant_param.first, weight_time_quant_param.second, + weight_time_data, _memory_manager.get()); + Tensor bias_tensor = makeInputTensor<DataType::S32>( + bias_shape, bias_quant_param.first, bias_quant_param.second, bias_data, _memory_manager.get()); + Tensor activation_state_tensor = makeOutputTensor( + DataType::S16, activation_state_quant_param.first, activation_state_quant_param.second); + activation_state_tensor.resize(activation_state_shape); + Tensor output_tensor = + makeOutputTensor(DataType::S8, output_quant_param.first, output_quant_param.second); + + 
Tensor scratchpad_activation_state(DataType::S16, Shape({}), {}, ""); + Tensor scratchpad_1(DataType::S32, Shape({}), {}, ""); + Tensor scratchpad_2(DataType::S32, Shape({}), {}, ""); + Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, ""); + + SVDFParams params{}; + params.activation = Activation::RELU; + params.asymmetric_quantize_inputs = false; + params.svdf_rank = rank; + + SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, &bias_tensor, + &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1, + &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad_activation_state); + _memory_manager->allocate_memory(scratchpad_1); + _memory_manager->allocate_memory(scratchpad_2); + _memory_manager->allocate_memory(scratchpad_3); + _memory_manager->allocate_memory(scratchpad_4); + _memory_manager->allocate_memory(scratchpad_5); + _memory_manager->allocate_memory(scratchpad_6); + kernel.execute(); + + std::vector<int8_t> ref_output_data{-9, 24, 31, 1, -10, 10, -3, 0}; + + std::vector<int32_t> ref_output_shape{batches, units}; + EXPECT_THAT(extractTensorData<int8_t>(output_tensor), ref_output_data); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + +TEST_F(SVDFTest, FloatTest) +{ + const int32_t batches = 2; + const int32_t input_size = 3; + const int32_t units = 4; + const int32_t memory_size = 10; + const int32_t rank = 1; + const int32_t num_filters = units * rank; + + Shape input_shape{batches, input_size}; + Shape weight_feature_shape{num_filters, input_size}; + Shape weight_time_shape{num_filters, memory_size}; + Shape 
activation_state_shape{batches, memory_size * num_filters}; + + std::vector<float> input_data{0.12609188, -0.46347019, -0.89598465, + 0.35867718, 0.36897406, 0.73463392}; + + std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667, 0.37613347, + 0.22197971, 0.12416199, 0.27901134, 0.27557442, + 0.3905206, -0.36137494, -0.06634006, -0.10640851}; + + std::vector<float> weight_time_data{ + -0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); + Tensor weight_feature_tensor = makeInputTensor<DataType::FLOAT32>( + weight_feature_shape, weight_feature_data, _memory_manager.get()); + Tensor weight_time_tensor = + makeInputTensor<DataType::FLOAT32>(weight_time_shape, weight_time_data, _memory_manager.get()); + Tensor activation_state_tensor = makeOutputTensor(DataType::FLOAT32); + activation_state_tensor.resize(activation_state_shape); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Tensor scratchpad_activation_state(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_1(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_2(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, ""); + + SVDFParams params{}; + params.activation = Activation::NONE; 
+ params.asymmetric_quantize_inputs = false; + params.svdf_rank = rank; + + SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, nullptr, + &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1, + &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params); + kernel.configure(); + _memory_manager->allocate_memory(output_tensor); + _memory_manager->allocate_memory(scratchpad_activation_state); + _memory_manager->allocate_memory(scratchpad_1); + _memory_manager->allocate_memory(scratchpad_2); + _memory_manager->allocate_memory(scratchpad_3); + _memory_manager->allocate_memory(scratchpad_4); + _memory_manager->allocate_memory(scratchpad_5); + _memory_manager->allocate_memory(scratchpad_6); + kernel.execute(); + + std::vector<float> ref_output_data{0.014899, -0.0517661, -0.143725, -0.00271883, + -0.03004015, 0.09565311, 0.1587342, 0.00784263}; + + std::vector<float> ref_output_shape{batches, units}; + const float tolerance = 1e-5; + EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data, tolerance)); + EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape)); +} + +TEST_F(SVDFTest, Unsupported_Type_Configure_NEG) +{ + const int32_t batches = 2; + const int32_t input_size = 3; + const int32_t units = 4; + const int32_t memory_size = 10; + const int32_t rank = 1; + const int32_t num_filters = units * rank; + + Shape input_shape{batches, input_size}; + Shape weight_feature_shape{num_filters, input_size}; + Shape weight_time_shape{num_filters, memory_size}; + Shape activation_state_shape{batches, memory_size * num_filters}; + + std::vector<int32_t> input_data{0, 1, 3, 4, 4, -2}; + + std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667, 0.37613347, + 0.22197971, 0.12416199, 0.27901134, 0.27557442, + 0.3905206, -0.36137494, -0.06634006, -0.10640851}; + + std::vector<float> weight_time_data{ + -0.31930989, 
0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657}; + + Tensor input_tensor = + makeInputTensor<DataType::S32>(input_shape, input_data, _memory_manager.get()); + Tensor weight_feature_tensor = makeInputTensor<DataType::FLOAT32>( + weight_feature_shape, weight_feature_data, _memory_manager.get()); + Tensor weight_time_tensor = + makeInputTensor<DataType::FLOAT32>(weight_time_shape, weight_time_data, _memory_manager.get()); + Tensor activation_state_tensor = makeOutputTensor(DataType::FLOAT32); + activation_state_tensor.resize(activation_state_shape); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Tensor scratchpad_activation_state(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_1(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_2(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, ""); + + SVDFParams params{}; + params.activation = Activation::NONE; + params.asymmetric_quantize_inputs = false; + params.svdf_rank = rank; + + SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, nullptr, + &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1, + &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(SVDFTest, Invalid_Input_Shape_NEG) +{ + 
const int32_t batches = 2; + const int32_t right_input_size = 3; + const int32_t wrong_input_size = 4; + const int32_t units = 4; + const int32_t memory_size = 10; + const int32_t rank = 1; + const int32_t num_filters = units * rank; + + Shape input_shape{batches, wrong_input_size}; + Shape weight_feature_shape{num_filters, right_input_size}; + Shape weight_time_shape{num_filters, memory_size}; + Shape activation_state_shape{batches, memory_size * num_filters}; + + std::vector<float> input_data{0, 1, 3, 2, 4, 4, -2, 1}; + + std::vector<float> weight_feature_data{-0.31930989, -0.36118156, 0.0079667, 0.37613347, + 0.22197971, 0.12416199, 0.27901134, 0.27557442, + 0.3905206, -0.36137494, -0.06634006, -0.10640851}; + + std::vector<float> weight_time_data{ + -0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657}; + + Tensor input_tensor = + makeInputTensor<DataType::FLOAT32>(input_shape, input_data, _memory_manager.get()); + Tensor weight_feature_tensor = makeInputTensor<DataType::FLOAT32>( + weight_feature_shape, weight_feature_data, _memory_manager.get()); + Tensor weight_time_tensor = + makeInputTensor<DataType::FLOAT32>(weight_time_shape, weight_time_data, _memory_manager.get()); + Tensor activation_state_tensor = makeOutputTensor(DataType::FLOAT32); + activation_state_tensor.resize(activation_state_shape); + Tensor output_tensor = makeOutputTensor(DataType::FLOAT32); + + Tensor scratchpad_activation_state(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_1(DataType::FLOAT32, Shape({}), {}, ""); + Tensor 
scratchpad_2(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_3(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_4(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_5(DataType::FLOAT32, Shape({}), {}, ""); + Tensor scratchpad_6(DataType::FLOAT32, Shape({}), {}, ""); + + SVDFParams params{}; + params.activation = Activation::NONE; + params.asymmetric_quantize_inputs = false; + params.svdf_rank = rank; + + SVDF kernel(&input_tensor, &weight_feature_tensor, &weight_time_tensor, nullptr, + &activation_state_tensor, &output_tensor, &scratchpad_activation_state, &scratchpad_1, + &scratchpad_2, &scratchpad_3, &scratchpad_4, &scratchpad_5, &scratchpad_6, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +} // namespace +} // namespace kernels +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp index 37a834a18..2fe2c5471 100644 --- a/compiler/luci-interpreter/src/kernels/Slice.cpp +++ b/compiler/luci-interpreter/src/kernels/Slice.cpp @@ -139,6 +139,11 @@ void Slice::execute() const getTensorData<uint8_t>(input()), getTensorShape(output()), getTensorData<uint8_t>(output())); break; + case DataType::S8: + luci_interpreter_pal::Slice(op_params, getTensorShape(input()), + getTensorData<int8_t>(input()), getTensorShape(output()), + getTensorData<int8_t>(output())); + break; default: throw std::runtime_error("Unsupported input type."); } diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp index 3e0d0b0d7..517982990 100644 --- a/compiler/luci-interpreter/src/kernels/Slice.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp @@ -31,8 +31,8 @@ template <typename T> class SliceTest : public ::testing::Test { }; -using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(SliceTest, DataTypes); +using DataTypes = ::testing::Types<float, uint8_t, int8_t>; 
+TYPED_TEST_SUITE(SliceTest, DataTypes); TYPED_TEST(SliceTest, SimpleTest) { diff --git a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp index 9de40b6ec..08e70672d 100644 --- a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp @@ -93,7 +93,7 @@ template <typename T> class SoftmaxTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t, int8_t>; -TYPED_TEST_CASE(SoftmaxTest, DataTypes); +TYPED_TEST_SUITE(SoftmaxTest, DataTypes); TYPED_TEST(SoftmaxTest, Simple) { diff --git a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp index e06501c8c..3a8b0a812 100644 --- a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp +++ b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp @@ -90,7 +90,7 @@ template <typename T> class SpaceToBatchNDTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(SpaceToBatchNDTest, DataTypes); +TYPED_TEST_SUITE(SpaceToBatchNDTest, DataTypes); TYPED_TEST(SpaceToBatchNDTest, Simple) { diff --git a/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp b/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp index 735c010b9..4af488618 100644 --- a/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp +++ b/compiler/luci-interpreter/src/kernels/SpaceToDepth.test.cpp @@ -32,7 +32,7 @@ template <typename T> class SpaceToDepthTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(SpaceToDepthTest, DataTypes); +TYPED_TEST_SUITE(SpaceToDepthTest, DataTypes); TYPED_TEST(SpaceToDepthTest, SimpleCase) { diff --git a/compiler/luci-interpreter/src/kernels/Split.test.cpp b/compiler/luci-interpreter/src/kernels/Split.test.cpp index 74d57aed3..283cd9aa9 100644 --- 
a/compiler/luci-interpreter/src/kernels/Split.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Split.test.cpp @@ -73,7 +73,7 @@ template <typename T> class SplitTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(SplitTest, DataTypes); +TYPED_TEST_SUITE(SplitTest, DataTypes); TYPED_TEST(SplitTest, FourDimensional) { diff --git a/compiler/luci-interpreter/src/kernels/SplitV.test.cpp b/compiler/luci-interpreter/src/kernels/SplitV.test.cpp index aac0567d7..035bc2122 100644 --- a/compiler/luci-interpreter/src/kernels/SplitV.test.cpp +++ b/compiler/luci-interpreter/src/kernels/SplitV.test.cpp @@ -77,7 +77,7 @@ template <typename T> class SplitVTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t, int16_t>; -TYPED_TEST_CASE(SplitVTest, DataTypes); +TYPED_TEST_SUITE(SplitVTest, DataTypes); TYPED_TEST(SplitVTest, ThreeDimensional) { diff --git a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp index d3326fe98..1bc0b6459 100644 --- a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp @@ -56,7 +56,7 @@ template <typename T> class SqueezeTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(SqueezeTest, DataTypes); +TYPED_TEST_SUITE(SqueezeTest, DataTypes); TYPED_TEST(SqueezeTest, TotalTest) { diff --git a/compiler/luci-interpreter/src/kernels/Sub.cpp b/compiler/luci-interpreter/src/kernels/Sub.cpp index 603c62d0f..24b6a72e5 100644 --- a/compiler/luci-interpreter/src/kernels/Sub.cpp +++ b/compiler/luci-interpreter/src/kernels/Sub.cpp @@ -37,6 +37,7 @@ Sub::Sub(const Tensor *input1, const Tensor *input2, Tensor *output, const SubPa void Sub::configure() { LUCI_INTERPRETER_CHECK(!(input1()->element_type() != input2()->element_type())) + LUCI_INTERPRETER_CHECK(!(input1()->element_type() != output()->element_type())) 
output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape())); } @@ -47,6 +48,12 @@ void Sub::execute() const case DataType::FLOAT32: evalFloat(); break; + case DataType::S64: + evalInteger<int64_t>(); + break; + case DataType::S32: + evalInteger<int32_t>(); + break; case DataType::U8: evalQuantized(); break; @@ -57,13 +64,8 @@ void Sub::execute() const void Sub::evalFloat() const { - float activation_min{}; - float activation_max{}; - calculateActivationRange(_params.activation, &activation_min, &activation_max); - tflite::ArithmeticParams params{}; - params.float_activation_min = activation_min; - params.float_activation_max = activation_max; + fillArithmeticActivationRange<float>(params, _params.activation); const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( getTensorShape(input1()), getTensorShape(input2()), ¶ms); @@ -82,6 +84,28 @@ void Sub::evalFloat() const } } +template <typename T> void Sub::evalInteger() const +{ + tflite::ArithmeticParams params{}; + fillArithmeticActivationRange<T>(params, _params.activation); + + const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes( + getTensorShape(input1()), getTensorShape(input2()), ¶ms); + + if (need_broadcast) + { + tflite::reference_ops::BroadcastSubSlow( + params, getTensorShape(input1()), getTensorData<T>(input1()), getTensorShape(input2()), + getTensorData<T>(input2()), getTensorShape(output()), getTensorData<T>(output())); + } + else + { + tflite::reference_ops::Sub(params, getTensorShape(input1()), getTensorData<T>(input1()), + getTensorShape(input2()), getTensorData<T>(input2()), + getTensorShape(output()), getTensorData<T>(output())); + } +} + void Sub::evalQuantized() const { const auto input1_scale = static_cast<double>(input1()->scale()); diff --git a/compiler/luci-interpreter/src/kernels/Sub.h b/compiler/luci-interpreter/src/kernels/Sub.h index d7940b5c6..23952b3bd 100644 --- a/compiler/luci-interpreter/src/kernels/Sub.h +++ 
b/compiler/luci-interpreter/src/kernels/Sub.h @@ -39,6 +39,7 @@ public: private: void evalFloat() const; + template <typename T> void evalInteger() const; void evalQuantized() const; }; diff --git a/compiler/luci-interpreter/src/kernels/Sub.test.cpp b/compiler/luci-interpreter/src/kernels/Sub.test.cpp index c189f4481..9abafd49a 100644 --- a/compiler/luci-interpreter/src/kernels/Sub.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Sub.test.cpp @@ -162,6 +162,51 @@ TEST_F(SubTest, Float) } } +template <loco::DataType DType> void CheckInteger(luci_interpreter::IMemoryManager *memory_manager) +{ + using dtype = typename loco::DataTypeImpl<DType>::Type; + Shape base_shape = {2, 3, 1, 2}; + std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}}; + std::vector<std::vector<dtype>> test_outputs = { + {0, 1, 2, 3, 0, 0, 0, 0, 4, 1, 0, 0, 0, 0, 7, 0, 3, 0, + 0, 2, 4, 4, 0, 0, 3, 0, 10, 0, 6, 0, 3, 0, 10, 2, 6, 0}, + {0, 1, 4, 1, 3, 0, 0, 2, 10, 0, 6, 0}, + {0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 4, 3, 0, 0, 3, 0, 7, 0, + 2, 4, 0, 2, 0, 0, 8, 0, 6, 0, 1, 0, 8, 2, 6, 0, 1, 0}, + {0, 0, 0, 0, 7, 0, 2, 4, 6, 0, 1, 0}}; + std::vector<dtype> input1_data{-1, 2, 1, 0, 4, -5, 1, 3, 7, -1, 7, 1}; + std::vector<dtype> input2_data{4, 1, -3, -1, 1, 6}; + for (size_t i = 0; i < test_shapes.size(); ++i) + { + Tensor input1_tensor = makeInputTensor<DType>(base_shape, input1_data, memory_manager); + Tensor input2_tensor = makeInputTensor<DType>(test_shapes[i], input2_data, memory_manager); + Tensor output_tensor = makeOutputTensor(DType); + + SubParams params{}; + params.activation = Activation::RELU; + + Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + kernel.configure(); + memory_manager->allocate_memory(output_tensor); + kernel.execute(); + + EXPECT_THAT(extractTensorData<dtype>(output_tensor), test_outputs[i]) + << "With shape number " << i; + } +}; + +TEST_F(SubTest, SInt32) +{ + CheckInteger<loco::DataType::S32>(_memory_manager.get()); + 
SUCCEED(); +} + +TEST_F(SubTest, SInt64) +{ + CheckInteger<loco::DataType::S64>(_memory_manager.get()); + SUCCEED(); +} + TEST_F(SubTest, Input_Output_Type_NEG) { Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f}, _memory_manager.get()); @@ -175,11 +220,24 @@ TEST_F(SubTest, Input_Output_Type_NEG) EXPECT_ANY_THROW(kernel.configure()); } -TEST_F(SubTest, Invalid_Input_Type_NEG) +TEST_F(SubTest, Invalid_Output_Type_NEG) { Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1}, _memory_manager.get()); Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get()); - Tensor output_tensor = makeOutputTensor(DataType::S64); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + SubParams params{}; + params.activation = Activation::RELU; + + Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + +TEST_F(SubTest, Invalid_Input_Type_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::U64>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::U64>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::U64); SubParams params{}; params.activation = Activation::RELU; @@ -190,6 +248,19 @@ TEST_F(SubTest, Invalid_Input_Type_NEG) EXPECT_ANY_THROW(kernel.execute()); } +TEST_F(SubTest, Mismatching_Input_Int_Types_NEG) +{ + Tensor input1_tensor = makeInputTensor<DataType::S32>({1}, {1}, _memory_manager.get()); + Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2}, _memory_manager.get()); + Tensor output_tensor = makeOutputTensor(DataType::S32); + + SubParams params{}; + params.activation = Activation::NONE; + + Sub kernel(&input1_tensor, &input2_tensor, &output_tensor, params); + EXPECT_ANY_THROW(kernel.configure()); +} + } // namespace } // namespace kernels } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp 
b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp index 107179910..43be8f8b9 100644 --- a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp @@ -52,7 +52,7 @@ template <typename T> class TransposeTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(TransposeTest, DataTypes); +TYPED_TEST_SUITE(TransposeTest, DataTypes); TYPED_TEST(TransposeTest, Small3D) { diff --git a/compiler/luci-interpreter/src/kernels/Unpack.test.cpp b/compiler/luci-interpreter/src/kernels/Unpack.test.cpp index 4f22c9f30..9384ddc83 100644 --- a/compiler/luci-interpreter/src/kernels/Unpack.test.cpp +++ b/compiler/luci-interpreter/src/kernels/Unpack.test.cpp @@ -75,7 +75,7 @@ template <typename T> class UnpackTest : public ::testing::Test }; using DataTypes = ::testing::Types<float, uint8_t>; -TYPED_TEST_CASE(UnpackTest, DataTypes); +TYPED_TEST_SUITE(UnpackTest, DataTypes); TYPED_TEST(UnpackTest, ThreeOutputs) { diff --git a/compiler/luci-interpreter/src/kernels/Utils.cpp b/compiler/luci-interpreter/src/kernels/Utils.cpp index 586cfa1e1..5d8e5db83 100644 --- a/compiler/luci-interpreter/src/kernels/Utils.cpp +++ b/compiler/luci-interpreter/src/kernels/Utils.cpp @@ -27,17 +27,18 @@ namespace luci_interpreter namespace kernels { -void calculateActivationRange(Activation activation, float *activation_min, float *activation_max) +template <typename T> +void calculateActivationRange(Activation activation, T *activation_min, T *activation_max) { switch (activation) { case Activation::NONE: - *activation_min = std::numeric_limits<float>::lowest(); - *activation_max = std::numeric_limits<float>::max(); + *activation_min = std::numeric_limits<T>::lowest(); + *activation_max = std::numeric_limits<T>::max(); break; case Activation::RELU: *activation_min = 0; - *activation_max = std::numeric_limits<float>::max(); + *activation_max = std::numeric_limits<T>::max(); break; case 
Activation::RELU_N1_TO_1: *activation_min = -1; @@ -52,6 +53,13 @@ void calculateActivationRange(Activation activation, float *activation_min, floa } } +template void calculateActivationRange(Activation activation, float *activation_min, + float *activation_max); +template void calculateActivationRange(Activation activation, int32_t *activation_min, + int32_t *activation_max); +template void calculateActivationRange(Activation activation, int64_t *activation_min, + int64_t *activation_max); + static void calculateActivationRangeQuantizedImpl(Activation activation, int32_t qmin, int32_t qmax, const Tensor *output, int32_t *activation_min, int32_t *activation_max) @@ -175,7 +183,11 @@ Shape calculateShapeForBroadcast(const Shape &input1_shape, const Shape &input2_ { const int32_t input1_dim = i < num_input1_dims ? input1_shape.dim(num_input1_dims - i - 1) : 1; const int32_t input2_dim = i < num_input2_dims ? input2_shape.dim(num_input2_dims - i - 1) : 1; - assert(input1_dim == input2_dim || input1_dim == 1 || input2_dim == 1); + + bool need_broadcast = input1_dim != input2_dim; + bool can_broadcast = input1_dim == 1 || input2_dim == 1; + LUCI_INTERPRETER_CHECK(!need_broadcast || can_broadcast); + output_shape.dim(num_out_dims - i - 1) = std::max(input1_dim, input2_dim); } diff --git a/compiler/luci-interpreter/src/kernels/Utils.h b/compiler/luci-interpreter/src/kernels/Utils.h index 817a42f83..ebeb20e66 100644 --- a/compiler/luci-interpreter/src/kernels/Utils.h +++ b/compiler/luci-interpreter/src/kernels/Utils.h @@ -76,11 +76,42 @@ inline int32_t calcOffset(const Shape &shape, int32_t d0, int32_t d1, int32_t d2 return ((d0 * shape.dim(1) + d1) * shape.dim(2) + d2) * shape.dim(3) + d3; } -void calculateActivationRange(Activation activation, float *activation_min, float *activation_max); +template <typename T> +void calculateActivationRange(Activation activation, T *activation_min, T *activation_max); void calculateActivationRangeQuantized(Activation activation, const 
Tensor *output, int32_t *activation_min, int32_t *activation_max);
+template <typename T> constexpr bool one_of_types() { return false; }
+
+// Checks if T is equal to one of {U,Other} types
+template <typename T, typename U, typename... Other> constexpr bool one_of_types()
+{
+  return std::is_same<T, U>::value || one_of_types<T, Other...>();
+}
+
+/**
+ * Fills activation min and max parameters depending on given data type and activation
+ *
+ * T is a template parameter, so after optimization only the branch for the requested type remains
+ *
+ * @tparam T data type of arithmetic operation output tensor
+ * @param p tflite params to fill
+ * @param act luci_interpreter::Activation of arithmetic operation
+ */
+template <typename T>
+void fillArithmeticActivationRange(tflite::ArithmeticParams &p, Activation act)
+{
+  static_assert(one_of_types<T, float, int32_t, int64_t>(), "Unsupported dtype");
+
+  if (std::is_same<T, float>::value)
+    calculateActivationRange(act, &p.float_activation_min, &p.float_activation_max);
+  else if (std::is_same<T, int32_t>::value)
+    calculateActivationRange(act, &p.quantized_activation_min, &p.quantized_activation_max);
+  else
+    calculateActivationRange(act, &p.int64_activation_min, &p.int64_activation_max);
+}
+
 // Decompose a double multiplier into a Q0.31 int32 representation of its
 // significand, and shift representation of its exponent.
// diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt index 2cde99f5d..292771592 100644 --- a/compiler/luci-interpreter/src/loader/CMakeLists.txt +++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt @@ -17,7 +17,9 @@ endmacro(REGISTER_KERNEL) include(${KERNEL_REGISTER_FILE}) add_library(${LUCI_INTERPRETER_LOADER} STATIC ${SOURCES}) -set_target_properties(${LUCI_INTERPRETER_LOADER} PROPERTIES POSITION_INDEPENDENT_CODE ON) +if (NOT NNCC_LIBRARY_NO_PIC) + set_target_properties(${LUCI_INTERPRETER_LOADER} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif(NOT NNCC_LIBRARY_NO_PIC) target_include_directories(${LUCI_INTERPRETER_LOADER} PUBLIC "${LUCI_INTERPRETER_PAL_DIR}") target_include_directories(${LUCI_INTERPRETER_LOADER} PUBLIC "${LUCI_INTERPRETER_SOURCE_DIR}") diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp index a14442ed5..dba39050c 100644 --- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp +++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp @@ -73,6 +73,26 @@ const void *getNodeData(const luci::CircleConst *node, size_t *data_size) } } +const void *getNodeData(const luci::CircleCustom *node, size_t *data_size) +{ + if (node->custom_code() != "CircleReferencingConst") + return nullptr; + + // helper struct which describes data loaded to custom_options of CircleReferencingConst node + // TODO move this struct to header + struct ConstDataReference + { + const uint8_t *data = nullptr; + uint32_t size = 0; + }; + + const auto &custom_options = node->custom_options(); + const auto &const_data_ref = *reinterpret_cast<const ConstDataReference *>(custom_options.data()); + + *data_size = const_data_ref.size; + return const_data_ref.data; +} + bool isExecutableNode(const luci::CircleNode *node) { switch (node->opcode()) @@ -83,12 +103,30 @@ bool isExecutableNode(const luci::CircleNode *node) case 
luci::CircleOpcode::CIRCLEOUTPUT: case luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE: // The following nodes denote outputs of multiple-output nodes. + case luci::CircleOpcode::CIRCLEBIDIRECTIONAL_SEQUENCE_LSTM_OUT: + case luci::CircleOpcode::CIRCLECUSTOMOUT: case luci::CircleOpcode::CIRCLEIFOUT: + case luci::CircleOpcode::CIRCLENONMAXSUPPRESSIONV4OUT: + case luci::CircleOpcode::CIRCLENONMAXSUPPRESSIONV5OUT: case luci::CircleOpcode::CIRCLESPLITOUT: case luci::CircleOpcode::CIRCLESPLITVOUT: + case luci::CircleOpcode::CIRCLETOPKV2OUT: + case luci::CircleOpcode::CIRCLEUNIQUEOUT: case luci::CircleOpcode::CIRCLEUNPACKOUT: + case luci::CircleOpcode::CIRCLEVARIABLE: case luci::CircleOpcode::CIRCLEWHILEOUT: return false; + // Custom nodes may be executable and non-executable + case luci::CircleOpcode::CUSTOM: + { + auto const custom_node = loco::must_cast<const luci::CircleCustom *>(node); + + // TODO handle more non-executable Custom ops here + if (custom_node->custom_code() == "CircleReferencingConst") + return false; + + return true; + } default: return true; } @@ -102,15 +140,34 @@ bool isTensorProducingNode(const luci::CircleNode *node) case luci::CircleOpcode::CIRCLEOUTPUT: // The following nodes are multiple-output nodes. They do not produce tensors, the tensors // are produced by the corresponding *Out nodes instead. 
+ case luci::CircleOpcode::BIDIRECTIONAL_SEQUENCE_LSTM: + case luci::CircleOpcode::CUSTOM: case luci::CircleOpcode::IF: + case luci::CircleOpcode::NON_MAX_SUPPRESSION_V4: + case luci::CircleOpcode::NON_MAX_SUPPRESSION_V5: case luci::CircleOpcode::SPLIT: + case luci::CircleOpcode::SPLIT_V: + case luci::CircleOpcode::TOPK_V2: + case luci::CircleOpcode::UNIQUE: case luci::CircleOpcode::UNPACK: + case luci::CircleOpcode::WHILE: return false; default: return true; } } +bool isSupportedCustomNode(const luci::CircleNode *node) +{ + const auto custom_node = loco::must_cast<const luci::CircleCustom *>(node); + + // TODO handle more Custom ops here + if (custom_node->custom_code() == "CircleReferencingConst") + return true; + + return false; +} + } // namespace GraphLoader::GraphLoader( @@ -129,18 +186,25 @@ void GraphLoader::loadTensors() { const auto *node = loco::must_cast<const luci::CircleNode *>(_graph->nodes()->at(i)); + if (node->opcode() == luci::CircleOpcode::CUSTOM && !isSupportedCustomNode(node)) + throw std::runtime_error("Unknown Custom Node, yet."); + if (!isTensorProducingNode(node)) continue; - // Only Input and Const nodes have shapes. Shapes of intermediate tensors will be inferred. + // Only Input, Const, Custom and Variable nodes have shapes. Shapes of intermediate tensors will + // be inferred. 
Shape shape{}; - if (const auto *input_node = dynamic_cast<const luci::CircleInput *>(node)) + switch (node->opcode()) { - shape = getNodeShape(input_node); - } - else if (const auto *const_node = dynamic_cast<const luci::CircleConst *>(node)) - { - shape = getNodeShape(const_node); + case luci::CircleOpcode::CIRCLECONST: + case luci::CircleOpcode::CIRCLECUSTOMOUT: + case luci::CircleOpcode::CIRCLEINPUT: + case luci::CircleOpcode::CIRCLEVARIABLE: + shape = getNodeShape(node); + break; + default: + break; } AffineQuantization quantization; @@ -175,6 +239,22 @@ void GraphLoader::loadTensors() tensor->writeData(const_data, data_size); } } + else if (const auto *custom_out_node = dynamic_cast<const luci::CircleCustomOut *>(node)) + { + const auto *custom_node = + loco::must_cast<const luci::CircleCustom *>(custom_out_node->input()); + + if (custom_node->custom_code() == "CircleReferencingConst") + { + size_t data_size{}; + const void *const_data = getNodeData(custom_node, &data_size); + if (const_data != nullptr) + { + _memory_manager->allocate_memory(*tensor); + tensor->writeData(const_data, data_size); + } + } + } _node_to_tensor.emplace(node, tensor.get()); _runtime_to_ir.tensor_to_node.emplace(tensor.get(), node); diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp index 7a457a62f..b221b6921 100644 --- a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp +++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp @@ -21,6 +21,7 @@ #include <kernels/Add.h> #include <kernels/ArgMax.h> #include <kernels/AveragePool2D.h> +#include <kernels/BatchMatMul.h> #include <kernels/Cast.h> #include <kernels/Concatenation.h> #include <kernels/Conv2D.h> @@ -54,6 +55,7 @@ #include <kernels/Mul.h> #include <kernels/Neg.h> #include <kernels/NotEqual.h> +#include <kernels/OneHot.h> #include <kernels/Pad.h> #include <kernels/PadV2.h> #include <kernels/Pow.h> @@ -209,6 +211,27 @@ 
TEST_F(KernelBuilderTest, AveragePool2D) EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction())); } +TEST_F(KernelBuilderTest, BatchMatMul) +{ + auto *lhs = createInputNode(); + auto *rhs = createInputNode(); + + auto *op = createNode<luci::CircleBatchMatMul>(); + op->x(lhs); + op->y(rhs); + op->adj_x(false); + op->adj_y(false); + + auto kernel = buildKernel<kernels::BatchMatMul>(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->x(), lhs); + checkTensor(kernel->y(), rhs); + checkTensor(kernel->output(), op); + EXPECT_THAT(kernel->params().adj_x, Eq(op->adj_x())); + EXPECT_THAT(kernel->params().adj_y, Eq(op->adj_y())); +} + TEST_F(KernelBuilderTest, Cast) { auto *input = createInputNode(); @@ -832,6 +855,31 @@ TEST_F(KernelBuilderTest, NotEqual) checkTensor(kernel->output(), op); } +TEST_F(KernelBuilderTest, OneHot) +{ + auto *indices = createInputNode(); + auto *depth = createInputNode(); + auto *on_value = createInputNode(); + auto *off_value = createInputNode(); + auto axis = 1; + + auto *op = createNode<luci::CircleOneHot>(); + op->indices(indices); + op->depth(depth); + op->on_value(on_value); + op->off_value(off_value); + op->axis(axis); + + auto kernel = buildKernel<kernels::OneHot>(op); + ASSERT_THAT(kernel, NotNull()); + + checkTensor(kernel->indices(), indices); + checkTensor(kernel->depth(), depth); + checkTensor(kernel->on_value(), on_value); + checkTensor(kernel->off_value(), off_value); + EXPECT_THAT(kernel->params().axis, Eq(op->axis())); +} + TEST_F(KernelBuilderTest, Pad) { auto *input = createInputNode(); diff --git a/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp index 5bc37bd4a..efb011257 100644 --- a/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp +++ b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp @@ -17,6 +17,7 @@ #include "Builders.h" #include "kernels/AveragePool2D.h" +#include 
<luci/Plan/CircleNodeExecutionPlan.h> namespace luci_interpreter { @@ -40,7 +41,26 @@ std::unique_ptr<Kernel> build_kernel_CircleAveragePool2D(const luci::CircleNode params.stride_width = node->stride()->w(); params.activation = node->fusedActivationFunction(); - return std::make_unique<kernels::AveragePool2D>(input, output, params); + // It is unknown what data will be stored in scratchpad tensor, + // using UINT8 as a most general option + auto scratchpad = std::make_unique<Tensor>(DataType::U8, Shape({}), AffineQuantization{}, ""); + scratchpad->set_observable(false); + scratchpad->set_data_buffer(nullptr); + // If node has execution plan then read memory offsets for scratchpad temporary tensor + // from the beginning of shared memory buffer. + // Used in Static Memory Manager. + // TODO move tensors offset initialization to one place + if (luci::has_execution_plan(node)) + { + const auto execution_plan = luci::get_execution_plan(node); + // Check whether the offset for the current CircleConv2D temporary was found. + if (execution_plan.offsets().size() > 1) + // If this is true, then we keep this offset in scratchpad. + scratchpad->set_offset(execution_plan.offsets().at(1)); + } + Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad)); + + return std::make_unique<kernels::AveragePool2D>(input, output, tmp, params); } } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp b/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp new file mode 100644 index 000000000..aae3dbab1 --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Builders.h" + +#include "kernels/BatchMatMul.h" +#include <luci/Plan/CircleNodeExecutionPlan.h> + +namespace luci_interpreter +{ + +std::unique_ptr<Kernel> build_kernel_CircleBatchMatMul(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = dynamic_cast<const luci::CircleBatchMatMul *>(circle_node); + if (node == nullptr) + throw std::runtime_error("wrong builder for operation"); + assert(node->arity() == 2); + + const Tensor *lhs = helper.getInputTensor(node->x()); + const Tensor *rhs = helper.getInputTensor(node->y()); + Tensor *output = helper.getOutputTensor(node); + + auto lhs_scratchpad = + std::make_unique<Tensor>(lhs->element_type(), Shape({}), AffineQuantization{}, ""); + lhs_scratchpad->set_observable(false); + lhs_scratchpad->set_data_buffer(nullptr); + auto rhs_scratchpad = + std::make_unique<Tensor>(rhs->element_type(), Shape({}), AffineQuantization{}, ""); + rhs_scratchpad->set_observable(false); + rhs_scratchpad->set_data_buffer(nullptr); + // If node has execution plan then read memory offsets for scratchpad temporary tensor + // from the beginning of shared memory buffer. + // Used in Static Memory Manager. + // TODO move tensors offset initialization to one place + if (luci::has_execution_plan(node)) + { + const auto execution_plan = luci::get_execution_plan(node); + // Check whether the offset for the current BatchMatMul temporary was found. 
+ if (execution_plan.offsets().size() > 1) + { + assert(execution_plan.offsets().size() == 3); + + // If this is true, then we keep this offset in scratchpad. + lhs_scratchpad->set_offset(execution_plan.offsets().at(1)); + rhs_scratchpad->set_offset(execution_plan.offsets().at(2)); + } + } + Tensor *lhs_tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(lhs_scratchpad)); + Tensor *rhs_tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(rhs_scratchpad)); + + BatchMatMulParams params; + params.adj_x = node->adj_x(); + params.adj_y = node->adj_y(); + + return std::make_unique<kernels::BatchMatMul>(lhs, rhs, output, lhs_tmp, rhs_tmp, params); +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp index 22fd1aca4..b48d97d19 100644 --- a/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp +++ b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp @@ -35,11 +35,12 @@ std::unique_ptr<Kernel> build_kernel_CircleConv2D(const luci::CircleNode *circle const Tensor *bias = helper.getOptionalInputTensor(node->bias()); Tensor *output = helper.getOutputTensor(node); - auto im2col = - std::make_unique<Tensor>(input->element_type(), Shape({}), AffineQuantization{}, ""); - im2col->set_observable(false); - im2col->set_data_buffer(nullptr); - // If node has execution plan then read memory offsets for im2col temporary tensor + // It is unknown what data will be stored in scratchpad tensor, + // using UINT8 as a most general option + auto scratchpad = std::make_unique<Tensor>(DataType::U8, Shape({}), AffineQuantization{}, ""); + scratchpad->set_observable(false); + scratchpad->set_data_buffer(nullptr); + // If node has execution plan then read memory offsets for scratchpad temporary tensor // from the beginning of shared memory buffer. // Used in Static Memory Manager. 
// TODO move tensors offset initialization to one place @@ -48,10 +49,10 @@ std::unique_ptr<Kernel> build_kernel_CircleConv2D(const luci::CircleNode *circle const auto execution_plan = luci::get_execution_plan(node); // Check whether the offset for the current CircleConv2D temporary was found. if (execution_plan.offsets().size() > 1) - // If this is true, then we keep this offset in im2col. - im2col->set_offset(execution_plan.offsets().at(1)); + // If this is true, then we keep this offset in scratchpad. + scratchpad->set_offset(execution_plan.offsets().at(1)); } - Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(im2col)); + Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad)); Conv2DParams params{}; params.padding = node->padding(); diff --git a/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp index c2f0346a2..db26ecf2e 100644 --- a/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp +++ b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp @@ -17,6 +17,7 @@ #include "Builders.h" #include "kernels/DepthwiseConv2D.h" +#include <luci/Plan/CircleNodeExecutionPlan.h> namespace luci_interpreter { @@ -43,7 +44,26 @@ std::unique_ptr<Kernel> build_kernel_CircleDepthwiseConv2D(const luci::CircleNod params.dilation_width_factor = node->dilation()->w(); params.activation = node->fusedActivationFunction(); - return std::make_unique<kernels::DepthwiseConv2D>(input, filter, bias, output, params); + // It is unknown what data will be stored in scratchpad tensor, + // using UINT8 as a most general option + auto scratchpad = std::make_unique<Tensor>(DataType::U8, Shape({}), AffineQuantization{}, ""); + scratchpad->set_observable(false); + scratchpad->set_data_buffer(nullptr); + // If node has execution plan then read memory offsets for scratchpad temporary tensor + // from the beginning of shared memory buffer. 
+ // Used in Static Memory Manager. + // TODO move tensors offset initialization to one place + if (luci::has_execution_plan(node)) + { + const auto execution_plan = luci::get_execution_plan(node); + // Check whether the offset for the current CircleConv2D temporary was found. + if (execution_plan.offsets().size() > 1) + // If this is true, then we keep this offset in scratchpad. + scratchpad->set_offset(execution_plan.offsets().at(1)); + } + Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad)); + + return std::make_unique<kernels::DepthwiseConv2D>(input, filter, bias, output, tmp, params); } } // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp b/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp new file mode 100644 index 000000000..4aae56469 --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Builders.h" + +#include "kernels/Dequantize.h" + +namespace luci_interpreter +{ + +std::unique_ptr<Kernel> build_kernel_CircleDequantize(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = dynamic_cast<const luci::CircleDequantize *>(circle_node); + if (node == nullptr) + throw std::runtime_error("wrong builder for operation"); + + const Tensor *input = helper.getInputTensor(node->input()); + Tensor *output = helper.getOutputTensor(node); + + return std::make_unique<kernels::Dequantize>(input, output); +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/ExpandDims.cpp b/compiler/luci-interpreter/src/loader/nodes/ExpandDims.cpp new file mode 100644 index 000000000..9840c34e5 --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/ExpandDims.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Builders.h" + +#include "kernels/ExpandDims.h" + +namespace luci_interpreter +{ + +std::unique_ptr<Kernel> build_kernel_CircleExpandDims(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = loco::must_cast<const luci::CircleExpandDims *>(circle_node); + assert(node->arity() == 2); + + const Tensor *input = helper.getInputTensor(node->input()); + const Tensor *axis = helper.getInputTensor(node->axis()); + Tensor *output = helper.getOutputTensor(node); + + return std::make_unique<kernels::ExpandDims>(input, axis, output); +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp index 2917598fc..0b8ac44bd 100644 --- a/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp +++ b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp @@ -36,6 +36,7 @@ std::unique_ptr<Kernel> build_kernel_CircleFullyConnected(const luci::CircleNode FullyConnectedParams params{}; params.activation = node->fusedActivationFunction(); + params.keep_num_dims = node->keep_num_dims(); return std::make_unique<kernels::FullyConnected>(input, weights, bias, output, params); } diff --git a/compiler/luci-interpreter/src/loader/nodes/Gather.cpp b/compiler/luci-interpreter/src/loader/nodes/Gather.cpp new file mode 100644 index 000000000..9df9775c5 --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/Gather.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Builders.h" + +#include "kernels/Gather.h" + +namespace luci_interpreter +{ + +std::unique_ptr<Kernel> build_kernel_CircleGather(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = dynamic_cast<const luci::CircleGather *>(circle_node); + if (node == nullptr) + throw std::runtime_error("wrong builder for operation"); + assert(node->arity() == 2); + + const Tensor *params = helper.getInputTensor(node->params()); + const Tensor *indices = helper.getInputTensor(node->indices()); + Tensor *output = helper.getOutputTensor(node); + + GatherParams gparams{}; + gparams.axis = node->axis(); + // TODO support batch_dims + gparams.batch_dims = 0; + + return std::make_unique<kernels::Gather>(params, indices, output, gparams); +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/OneHot.cpp b/compiler/luci-interpreter/src/loader/nodes/OneHot.cpp new file mode 100644 index 000000000..a40160945 --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/OneHot.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Builders.h" + +#include "kernels/OneHot.h" + +namespace luci_interpreter +{ + +std::unique_ptr<Kernel> build_kernel_CircleOneHot(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = loco::must_cast<const luci::CircleOneHot *>(circle_node); + assert(node->arity() == 4); + + const Tensor *indices = helper.getInputTensor(node->indices()); + const Tensor *depth = helper.getInputTensor(node->depth()); + const Tensor *on_value = helper.getInputTensor(node->on_value()); + const Tensor *off_value = helper.getInputTensor(node->off_value()); + Tensor *output = helper.getOutputTensor(node); + + OneHotParams params{}; + params.axis = node->axis(); + + return std::make_unique<kernels::OneHot>(indices, depth, on_value, off_value, output, params); +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp b/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp new file mode 100644 index 000000000..fd9836345 --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Builders.h" + +#include "kernels/Quantize.h" + +namespace luci_interpreter +{ + +std::unique_ptr<Kernel> build_kernel_CircleQuantize(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = dynamic_cast<const luci::CircleQuantize *>(circle_node); + if (node == nullptr) + throw std::runtime_error("wrong builder for operation"); + + const Tensor *input = helper.getInputTensor(node->input()); + Tensor *output = helper.getOutputTensor(node); + + return std::make_unique<kernels::Quantize>(input, output); +} + +} // namespace luci_interpreter diff --git a/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp b/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp new file mode 100644 index 000000000..89528d5ee --- /dev/null +++ b/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Builders.h" + +#include "kernels/SVDF.h" + +namespace luci_interpreter +{ + +std::unique_ptr<Kernel> build_kernel_CircleSVDF(const luci::CircleNode *circle_node, + KernelBuilderHelper &helper) +{ + const auto *node = dynamic_cast<const luci::CircleSVDF *>(circle_node); + if (node == nullptr) + throw std::runtime_error("wrong builder for operation"); + + const Tensor *input = helper.getInputTensor(node->input()); + const Tensor *feature = helper.getInputTensor(node->weight_feature()); + const Tensor *time = helper.getInputTensor(node->weight_time()); + const Tensor *bias = helper.getOptionalInputTensor(node->bias()); + const Tensor *input_activation_state = helper.getInputTensor(node->input_activation_state()); + Tensor *output = helper.getOutputTensor(node); + + auto scratchpad_tensor = std::make_unique<Tensor>(input_activation_state->element_type(), + Shape({}), AffineQuantization{}, ""); + scratchpad_tensor->set_observable(false); + scratchpad_tensor->set_data_buffer(nullptr); + Tensor *tmp = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad_tensor)); + + DataType data_type = input->element_type() == DataType::S8 ? 
DataType::S32 : DataType::FLOAT32; + + scratchpad_tensor = std::make_unique<Tensor>(data_type, Shape({}), AffineQuantization{}, ""); + scratchpad_tensor->set_observable(false); + scratchpad_tensor->set_data_buffer(nullptr); + Tensor *tmp_1 = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad_tensor)); + + if (data_type == DataType::FLOAT32 && + (feature->element_type() == DataType::S8 || feature->element_type() == DataType::U8)) + { + data_type = feature->element_type(); + } + + scratchpad_tensor = std::make_unique<Tensor>(data_type, Shape({}), AffineQuantization{}, ""); + scratchpad_tensor->set_observable(false); + scratchpad_tensor->set_data_buffer(nullptr); + Tensor *tmp_2 = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad_tensor)); + + data_type = DataType::FLOAT32; + + scratchpad_tensor = std::make_unique<Tensor>(data_type, Shape({}), AffineQuantization{}, ""); + scratchpad_tensor->set_observable(false); + scratchpad_tensor->set_data_buffer(nullptr); + Tensor *tmp_3 = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad_tensor)); + + scratchpad_tensor = std::make_unique<Tensor>(data_type, Shape({}), AffineQuantization{}, ""); + scratchpad_tensor->set_observable(false); + scratchpad_tensor->set_data_buffer(nullptr); + Tensor *tmp_4 = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad_tensor)); + + scratchpad_tensor = std::make_unique<Tensor>(data_type, Shape({}), AffineQuantization{}, ""); + scratchpad_tensor->set_observable(false); + scratchpad_tensor->set_data_buffer(nullptr); + Tensor *tmp_5 = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad_tensor)); + + scratchpad_tensor = std::make_unique<Tensor>(data_type, Shape({}), AffineQuantization{}, ""); + scratchpad_tensor->set_observable(false); + scratchpad_tensor->set_data_buffer(nullptr); + Tensor *tmp_6 = helper.getRuntimeGraph(node->graph())->addTensor(std::move(scratchpad_tensor)); + + SVDFParams params{}; + 
params.activation = node->fusedActivationFunction(); + params.svdf_rank = node->svdf_rank(); + params.asymmetric_quantize_inputs = node->asymmetric_quantize_inputs(); + + return std::make_unique<kernels::SVDF>(input, feature, time, bias, input_activation_state, output, + tmp, tmp_1, tmp_2, tmp_3, tmp_4, tmp_5, tmp_6, params); +} + +} // namespace luci_interpreter |