diff options
author | Chunseok Lee <chunseok.lee@samsung.com> | 2022-04-15 19:15:11 +0900 |
---|---|---|
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2022-04-15 19:15:11 +0900 |
commit | 3ad689f0803519e343c36d5700646e86059df961 (patch) | |
tree | 862346c401a5577518fa7f042532aa931b53aa0e /compiler/circle-execution-plan | |
parent | ac6e4dd7b480e83b586ef533d7b29a8a97eb48fe (diff) | |
download | nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.gz nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.bz2 nnfw-3ad689f0803519e343c36d5700646e86059df961.zip |
Imported Upstream version 1.20.0 upstream/1.20.0 submit/tizen/20220415.103159
Diffstat (limited to 'compiler/circle-execution-plan')
10 files changed, 709 insertions, 91 deletions
diff --git a/compiler/circle-execution-plan/CMakeLists.txt b/compiler/circle-execution-plan/CMakeLists.txt index 115d24860..2f657c171 100644 --- a/compiler/circle-execution-plan/CMakeLists.txt +++ b/compiler/circle-execution-plan/CMakeLists.txt @@ -1,4 +1,9 @@ set(SOURCES + pal/IScratchpadHelper.h + pal/ScratchpadHelperLinux.h + pal/ScratchpadHelperMCU.h + pal/ScratchpadHelperCMSISNN.h + pal/TargetPlatform.h src/CircleExecutionPlan.cpp src/ExecutionPlanner.cpp src/ExecutionPlanner.h @@ -13,4 +18,5 @@ target_link_libraries(circle_execution_plan luci_export) target_link_libraries(circle_execution_plan luci_plan) target_link_libraries(circle_execution_plan arser) +target_include_directories(circle_execution_plan PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/pal") install(TARGETS circle_execution_plan DESTINATION bin) diff --git a/compiler/circle-execution-plan/README.md b/compiler/circle-execution-plan/README.md index e789a55db..dbb7d4f85 100644 --- a/compiler/circle-execution-plan/README.md +++ b/compiler/circle-execution-plan/README.md @@ -10,13 +10,12 @@ The output circle file contains plan (`CircleNodeMemoryPlan`) information for ev - number which determines order in which nodes will be executed - memory offsets for node output tensors from the beginning of shared memory buffer -In order to record and read this metadata, we use `CircleImportMetadata` and `CircleExportMetadata`. -For this purpose we use `std::map<uint32_t, std::vector<uint32_t>> _memory_plan_table` which for each node with key ID contains encoded `CircleNodeMemoryPlan` data. +In order to record and read this data, we use `luci::CircleNodeExecutionPlan`. ### Execution plan building In order to build "execution plan" we use `ExecutionPlanner` class. -The main method is `get_execution_plan()` which for each node finds and writes to its annotations +The main method is `make_execution_plan()` which for each node finds and writes to its annotations "execution plan". 
For this purpose there are two steps: - determining the order of execution of nodes, which is stored in `_ordered_nodes` vector. Now for this purpose there is only one default method `get_default_execution_order_plan()` that uses `loco::postorder_traversal(const std::vector<loco::Node *> &roots)`. diff --git a/compiler/circle-execution-plan/pal/IScratchpadHelper.h b/compiler/circle-execution-plan/pal/IScratchpadHelper.h new file mode 100644 index 000000000..f5a991526 --- /dev/null +++ b/compiler/circle-execution-plan/pal/IScratchpadHelper.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CIRCLE_EXECUTION_PLAN_ISRCRATCHPAD_HELPER_H +#define CIRCLE_EXECUTION_PLAN_ISRCRATCHPAD_HELPER_H + +#include <luci/IR/Nodes/CircleAveragePool2D.h> +#include <luci/IR/Nodes/CircleBatchMatMul.h> +#include <luci/IR/Nodes/CircleConv2D.h> +#include <luci/IR/Nodes/CircleDepthwiseConv2D.h> +#include <luci/IR/Nodes/CircleSVDF.h> +#include <cstdint> + +namespace circle_planner +{ + +class IScratchpadHelper +{ +public: + virtual uint32_t + ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) = 0; + + virtual std::vector<uint32_t> + ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) = 0; + + virtual uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *conv) = 0; + + virtual uint32_t + ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) = 0; + + virtual std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) = 0; + + virtual ~IScratchpadHelper() = default; +}; + +} // namespace circle_planner + +#endif // CIRCLE_EXECUTION_PLAN_ISRCRATCHPAD_HELPER_H diff --git a/compiler/circle-execution-plan/pal/ScratchpadHelperCMSISNN.h b/compiler/circle-execution-plan/pal/ScratchpadHelperCMSISNN.h new file mode 100644 index 000000000..5369c0937 --- /dev/null +++ b/compiler/circle-execution-plan/pal/ScratchpadHelperCMSISNN.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_CMSISNN_H +#define CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_CMSISNN_H + +#include "IScratchpadHelper.h" +#include <cassert> + +namespace circle_planner +{ + +namespace +{ + +inline int32_t computePadding(int32_t stride, int32_t dilation_rate, int32_t in_size, + int32_t filter_size, int32_t out_size) +{ + const int32_t effective_filter_size = (filter_size - 1) * dilation_rate + 1; + const int32_t padding = ((out_size - 1) * stride + effective_filter_size - in_size) / 2; + return padding > 0 ? padding : 0; +} + +} // namespace + +class ScratchpadHelperCMSISNN : public IScratchpadHelper +{ +public: + explicit ScratchpadHelperCMSISNN(bool use_dsp) : _use_dsp(use_dsp) + { + // Do nothing + } + + uint32_t ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) final + { + // Main logic of arm_avgpool_s8_get_buffer_size + + const auto avg_pool_input = loco::must_cast<luci::CircleNode *>(avg_pool->value()); + + if (avg_pool_input->dtype() != loco::DataType::S8 or !_use_dsp) + return 0; + + const auto depth = static_cast<int32_t>(avg_pool_input->dim(3).value()); + + return depth * sizeof(int32_t); + } + + std::vector<uint32_t> + ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) final + { + throw std::runtime_error("BatchMatMul is not currently supported for cmsisnn platform"); + } + + uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *conv) final + { + // Main logic of arm_convolve_wrapper_s8_get_buffer_size + + const auto dilation_height_factor = static_cast<int32_t>(conv->dilation()->h()); + const auto dilation_width_factor = static_cast<int32_t>(conv->dilation()->w()); + + const auto conv_input = loco::must_cast<luci::CircleNode *>(conv->input()); + const auto filter = loco::must_cast<luci::CircleNode *>(conv->filter()); + + if (dilation_width_factor 
!= 1 or dilation_height_factor != 1 or + conv_input->dtype() != loco::DataType::S8) + { + return 0; + } + + const auto input_depth = static_cast<int32_t>(conv_input->dim(3).value()); + + const auto input_height = static_cast<int32_t>(conv_input->dim(1).value()); + const auto input_width = static_cast<int32_t>(conv_input->dim(2).value()); + + const auto filter_height = static_cast<int32_t>(filter->dim(1).value()); + const auto filter_width = static_cast<int32_t>(filter->dim(2).value()); + + const auto stride_height = static_cast<int32_t>(conv->stride()->h()); + const auto stride_width = static_cast<int32_t>(conv->stride()->w()); + + const auto output_height = static_cast<int32_t>(conv->dim(1).value()); + const auto output_width = static_cast<int32_t>(conv->dim(2).value()); + + assert(conv_input->quantparam()->zerop.size() == 1); + assert(conv->quantparam()->zerop.size() == 1); + + const auto padding_height = computePadding(stride_height, dilation_height_factor, input_height, + filter_height, output_height); + const auto padding_width = + computePadding(stride_width, dilation_width_factor, input_width, filter_width, output_width); + + if ((padding_width == 0) && (padding_height == 0) && (input_depth % 4 == 0) && + (stride_width == 1) && (stride_height == 1) && (filter_width == 1) && (filter_height == 1)) + { + return 0; + } + + if (_use_dsp) + { + return (2 * input_depth * filter_width * filter_height) * sizeof(int16_t); + } + + return 0; + } + + uint32_t + ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) final + { + // Main logic of arm_depthwise_conv_wrapper_s8_get_buffer_size + + const auto dilation_height_factor = static_cast<int32_t>(depthwise_conv->dilation()->h()); + const auto dilation_width_factor = static_cast<int32_t>(depthwise_conv->dilation()->w()); + + const auto depthwise_conv_input = loco::must_cast<luci::CircleNode *>(depthwise_conv->input()); + const auto filter = loco::must_cast<luci::CircleNode 
*>(depthwise_conv->filter()); + + if (dilation_width_factor != 1 or dilation_height_factor != 1 or + depthwise_conv_input->dtype() != loco::DataType::S8) + { + return 0; + } + + const auto input_depth = static_cast<int32_t>(depthwise_conv_input->dim(3).value()); + const auto output_depth = static_cast<int32_t>(depthwise_conv->dim(3).value()); + const auto batch_size = static_cast<int32_t>(depthwise_conv_input->dim(0).value()); + + if (input_depth != output_depth or batch_size != 1 or !_use_dsp) + return 0; + + const auto filter_height = static_cast<int32_t>(filter->dim(1).value()); + const auto filter_width = static_cast<int32_t>(filter->dim(2).value()); + + return input_depth * filter_height * filter_width * sizeof(int16_t); + } + + std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) final + { + const auto svdf_input = loco::must_cast<luci::CircleNode *>(svdf->input()); + const auto weight_feature_input = loco::must_cast<luci::CircleNode *>(svdf->weight_feature()); + + if (svdf_input->dtype() == loco::DataType::FLOAT32 and + (weight_feature_input->dtype() == loco::DataType::S8 or + weight_feature_input->dtype() == loco::DataType::U8)) + { + throw std::runtime_error("Hybrid type is not currently supported for linux platform"); + } + + std::vector<uint32_t> scratchpad_sizes; + + const auto batch_size = svdf_input->dim(0).value(); + const auto num_filters = weight_feature_input->dim(0).value(); + const auto rank = svdf->svdf_rank(); + const auto num_units = num_filters / rank; + + if (svdf_input->dtype() == loco::DataType::S8) + { + scratchpad_sizes.push_back(batch_size * num_filters * sizeof(int32_t)); + scratchpad_sizes.push_back(batch_size * num_units * sizeof(int32_t)); + } + else + { + scratchpad_sizes.push_back(batch_size * num_filters * sizeof(float)); + } + + return scratchpad_sizes; + } + +private: + bool _use_dsp; +}; + +} // namespace circle_planner + +#endif // CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_CMSISNN_H diff --git 
a/compiler/circle-execution-plan/pal/ScratchpadHelperLinux.h b/compiler/circle-execution-plan/pal/ScratchpadHelperLinux.h new file mode 100644 index 000000000..811aa67c3 --- /dev/null +++ b/compiler/circle-execution-plan/pal/ScratchpadHelperLinux.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_LINUX_H +#define CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_LINUX_H + +#include "IScratchpadHelper.h" +#include <loco/IR/DataTypeTraits.h> + +namespace circle_planner +{ + +class ScratchpadHelperLinux : public IScratchpadHelper +{ +public: + uint32_t ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) final + { + // for linux AveragePool2d scratchpad tensors size = 0 + return 0; + } + + std::vector<uint32_t> + ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) final + { + const auto lhs = loco::must_cast<luci::CircleNode *>(batch_mat_mul->x()); + const auto rhs = loco::must_cast<luci::CircleNode *>(batch_mat_mul->y()); + + std::vector<uint32_t> scratchpad_sizes; + + // Scratchpad for lhs + uint32_t scratchpad_size = 1; + for (int32_t i = 0; i < lhs->rank(); ++i) + scratchpad_size *= lhs->dim(i).value(); + + scratchpad_sizes.push_back(scratchpad_size * loco::size(lhs->dtype())); + + // Scratchpad for rhs + scratchpad_size = 1; + for (int32_t i = 0; i < rhs->rank(); 
++i) + scratchpad_size *= rhs->dim(i).value(); + + scratchpad_sizes.push_back(scratchpad_size * loco::size(rhs->dtype())); + + return scratchpad_sizes; + } + + uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *conv) final + { + const auto conv_input = loco::must_cast<luci::CircleNode *>(conv->input()); + const auto filter = loco::must_cast<luci::CircleNode *>(conv->filter()); + + const uint32_t stride_height = conv->stride()->h(); + const uint32_t stride_width = conv->stride()->w(); + + const uint32_t dilation_height_factor = conv->dilation()->h(); + const uint32_t dilation_width_factor = conv->dilation()->w(); + + const uint32_t filter_height = filter->dim(1).value(); + const uint32_t filter_width = filter->dim(2).value(); + + const bool need_dilated_im2col = dilation_height_factor != 1 || dilation_width_factor != 1; + const bool need_non_dilated_im2col = + stride_height != 1 || stride_width != 1 || filter_height != 1 || filter_width != 1; + const bool need_im2col = conv_input->dtype() != loco::DataType::S16 && + (need_dilated_im2col || need_non_dilated_im2col); + + if (!need_im2col) + { + return 0; + } + + const uint32_t input_depth = conv_input->dim(3).value(); + const uint32_t batches = conv_input->dim(0).value(); + + const uint32_t output_height = conv->dim(1).value(); + const uint32_t output_width = conv->dim(2).value(); + + return batches * output_height * output_width * input_depth * filter_height * filter_width * + size(conv_input->dtype()); + } + + uint32_t + ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) final + { + // for linux DepthwiseConv2d scratchpad tensors size = 0 + return 0; + } + + std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) final + { + const auto svdf_input = loco::must_cast<luci::CircleNode *>(svdf->input()); + const auto weight_feature_input = loco::must_cast<luci::CircleNode *>(svdf->weight_feature()); + + if (svdf_input->dtype() == 
loco::DataType::FLOAT32 and + (weight_feature_input->dtype() == loco::DataType::S8 or + weight_feature_input->dtype() == loco::DataType::U8)) + { + throw std::runtime_error("Hybrid type is not currently supported for linux platform"); + } + + std::vector<uint32_t> scratchpad_sizes; + + const auto batch_size = svdf_input->dim(0).value(); + const auto num_filters = weight_feature_input->dim(0).value(); + const auto rank = svdf->svdf_rank(); + const auto num_units = num_filters / rank; + + if (svdf_input->dtype() == loco::DataType::S8) + { + scratchpad_sizes.push_back(batch_size * num_filters * sizeof(int32_t)); + scratchpad_sizes.push_back(batch_size * num_units * sizeof(int32_t)); + } + else + { + scratchpad_sizes.push_back(batch_size * num_filters * sizeof(float)); + } + + return scratchpad_sizes; + } +}; + +} // namespace circle_planner + +#endif // CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_LINUX_H diff --git a/compiler/circle-execution-plan/pal/ScratchpadHelperMCU.h b/compiler/circle-execution-plan/pal/ScratchpadHelperMCU.h new file mode 100644 index 000000000..14b41640c --- /dev/null +++ b/compiler/circle-execution-plan/pal/ScratchpadHelperMCU.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_MCU_H +#define CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_MCU_H + +#include "IScratchpadHelper.h" + +namespace circle_planner +{ + +class ScratchpadHelperMCU : public IScratchpadHelper +{ +public: + uint32_t ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) final + { + // for mcu AveragePool2d scratchpad tensors size = 0 + return 0; + } + + std::vector<uint32_t> + ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) final + { + throw std::runtime_error("BatchMatMul is not currently supported for mcu platform"); + } + + uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *) final + { + // for mcu scratchpad size = 0 + return 0; + } + + uint32_t + ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) final + { + // for mcu DepthwiseConv2d scratchpad tensors size = 0 + return 0; + } + + std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) final + { + const auto svdf_input = loco::must_cast<luci::CircleNode *>(svdf->input()); + const auto weight_feature_input = loco::must_cast<luci::CircleNode *>(svdf->weight_feature()); + + if (svdf_input->dtype() == loco::DataType::FLOAT32 and + (weight_feature_input->dtype() == loco::DataType::S8 or + weight_feature_input->dtype() == loco::DataType::U8)) + { + throw std::runtime_error("Hybrid type is not currently supported for linux platform"); + } + + std::vector<uint32_t> scratchpad_sizes; + + const auto batch_size = svdf_input->dim(0).value(); + const auto num_filters = weight_feature_input->dim(0).value(); + const auto rank = svdf->svdf_rank(); + const auto num_units = num_filters / rank; + + if (svdf_input->dtype() == loco::DataType::S8) + { + scratchpad_sizes.push_back(batch_size * num_filters * sizeof(int32_t)); + scratchpad_sizes.push_back(batch_size * num_units * sizeof(int32_t)); + } + else + { + scratchpad_sizes.push_back(batch_size * num_filters * 
sizeof(float)); + } + + return scratchpad_sizes; + } +}; + +} // namespace circle_planner + +#endif // CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_MCU_H diff --git a/compiler/circle-execution-plan/pal/TargetPlatform.h b/compiler/circle-execution-plan/pal/TargetPlatform.h new file mode 100644 index 000000000..538a502fe --- /dev/null +++ b/compiler/circle-execution-plan/pal/TargetPlatform.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CIRCLE_EXECUTION_PLAN_TARGET_PLATFORM_H +#define CIRCLE_EXECUTION_PLAN_TARGET_PLATFORM_H + +namespace circle_planner +{ + +enum SupportedPlatformType +{ + LINUX, + MCU, + CMSISNN +}; + +struct TargetPlatform +{ + SupportedPlatformType platform_type; + bool use_dsp; +}; + +} // namespace circle_planner + +#endif // CIRCLE_EXECUTION_PLAN_TARGET_PLATFORM_H diff --git a/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp b/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp index a54100b8c..1788124c3 100644 --- a/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp +++ b/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp @@ -35,6 +35,18 @@ int entry(int argc, char **argv) arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model"); arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model"); + arser.add_argument("--platform") + .nargs(1) + .type(arser::DataType::STR) + .required(false) + .default_value("linux") + .help("Platform name: linux mcu cmsisnn"); + arser.add_argument("--use_dsp") + .nargs(1) + .type(arser::DataType::BOOL) + .required(false) + .default_value(false) + .help("Plan with or without dsp (now can be used only with cmsisnn)"); try { @@ -47,8 +59,35 @@ int entry(int argc, char **argv) return 255; } - std::string input_path = arser.get<std::string>("input"); - std::string output_path = arser.get<std::string>("output"); + const std::string input_path = arser.get<std::string>("input"); + const std::string output_path = arser.get<std::string>("output"); + const std::string platform_name = arser.get<std::string>("--platform"); + const bool use_dsp = arser.get<bool>("--use_dsp"); + + if (platform_name != "cmsisnn" && use_dsp) + { + std::cerr << "ERROR: Now use_dsp can be used only with cmsisnn" << std::endl; + return EXIT_FAILURE; + } + + circle_planner::SupportedPlatformType platform_type; + if (platform_name == "linux") + { + platform_type = 
circle_planner::SupportedPlatformType::LINUX; + } + else if (platform_name == "mcu") + { + platform_type = circle_planner::SupportedPlatformType::MCU; + } + else if (platform_name == "cmsisnn") + { + platform_type = circle_planner::SupportedPlatformType::CMSISNN; + } + else + { + std::cerr << "ERROR: Invalid platform name '" << platform_name << "'" << std::endl; + return EXIT_FAILURE; + } foder::FileLoader file_loader{input_path}; std::vector<char> model_data; @@ -82,8 +121,8 @@ int entry(int argc, char **argv) auto module = importer.importModule(circle_model); // Do main job - luci::ExecutionPlanner execution_planner(module->graph()); - execution_planner.get_execution_plan(); + circle_planner::ExecutionPlanner execution_planner(module->graph(), {platform_type, use_dsp}); + execution_planner.make_execution_plan(); // Export to output Circle file luci::CircleExporter exporter; diff --git a/compiler/circle-execution-plan/src/ExecutionPlanner.cpp b/compiler/circle-execution-plan/src/ExecutionPlanner.cpp index c37d1e5f5..ec2ec1362 100644 --- a/compiler/circle-execution-plan/src/ExecutionPlanner.cpp +++ b/compiler/circle-execution-plan/src/ExecutionPlanner.cpp @@ -18,72 +18,49 @@ #include <loco/IR/Algorithm.h> #include <luci/UserSettings.h> -namespace luci +namespace circle_planner { namespace { -constexpr uint32_t nodeNotAssigned = std::numeric_limits<int32_t>::max(); +constexpr uint32_t node_not_assigned = std::numeric_limits<int32_t>::max(); -uint32_t compute_output_size(Padding padding, uint32_t image_size, uint32_t filter_size, - uint32_t stride, uint32_t dilation_rate = 1) +bool isExecutableNode(const luci::CircleNode *node) { - const int32_t effective_filter_size = (filter_size - 1) * dilation_rate + 1; - switch (padding) + switch (node->opcode()) { - case Padding::SAME: - return (image_size + stride - 1) / stride; - case Padding::VALID: - return (image_size + stride - effective_filter_size) / stride; + // The following nodes denote outputs of multiple-output 
nodes. + // The list is synchronized with the same list from luci-interpreter/src/loader/GraphLoader.cpp + case luci::CircleOpcode::CIRCLEIFOUT: + case luci::CircleOpcode::CIRCLESPLITOUT: + case luci::CircleOpcode::CIRCLESPLITVOUT: + case luci::CircleOpcode::CIRCLEUNPACKOUT: + case luci::CircleOpcode::CIRCLEWHILEOUT: + return false; default: - assert(false); + return true; } } -// Method finds (if necessary) size for im2col temporary tensor. -uint32_t compute_im2col_size(const luci::CircleConv2D *conv) +bool isTensorProducingNode(const luci::CircleNode *node) { - auto conv_input = loco::must_cast<luci::CircleNode *>(conv->input()); - auto filter = loco::must_cast<luci::CircleNode *>(conv->filter()); - auto padding = (conv->padding()); - uint32_t stride_height = conv->stride()->h(); - uint32_t stride_width = conv->stride()->w(); - - uint32_t dilation_height_factor = conv->dilation()->h(); - uint32_t dilation_width_factor = conv->dilation()->w(); - - uint32_t filter_height = filter->dim(1).value(); - uint32_t filter_width = filter->dim(2).value(); - - const bool need_dilated_im2col = dilation_height_factor != 1 || dilation_width_factor != 1; - const bool need_non_dilated_im2col = - stride_height != 1 || stride_width != 1 || filter_height != 1 || filter_width != 1; - bool need_im2col = - conv_input->dtype() != loco::DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col); - - if (!need_im2col) + switch (node->opcode()) { - return 0; + // The following nodes are multiple-output nodes. They do not produce tensors, the tensors + // are produced by the corresponding *Out nodes instead. 
+ // The list is synchronized with the same list from luci-interpreter/src/loader/GraphLoader.cpp + case luci::CircleOpcode::IF: + case luci::CircleOpcode::SPLIT: + case luci::CircleOpcode::UNPACK: + return false; + default: + return true; } - - uint32_t input_depth = conv_input->dim(3).value(); - uint32_t input_height = conv_input->dim(1).value(); - uint32_t input_width = conv_input->dim(2).value(); - - uint32_t output_height = compute_output_size(padding, input_height, filter_height, stride_height, - dilation_height_factor); - uint32_t output_width = - compute_output_size(padding, input_width, filter_width, stride_width, dilation_width_factor); - - uint32_t batches = conv_input->dim(0).value(); - - return batches * output_height * output_width * input_depth * filter_height * filter_width * - size(conv_input->dtype()); } } // namespace -void ExecutionPlanner::get_execution_plan() +void ExecutionPlanner::make_execution_plan() { get_default_execution_order_plan(); _required_size = get_offsets_with_greedy_by_size(); @@ -106,23 +83,23 @@ void ExecutionPlanner::get_default_execution_order_plan() void ExecutionPlanner::get_usage_interval() { // Initialize vectors of first and last nodes for usage interval - _alloc_node.assign(_ordered_nodes.size(), nodeNotAssigned); - _dealloc_node.assign(_ordered_nodes.size(), nodeNotAssigned); + _alloc_node.assign(_ordered_nodes.size(), node_not_assigned); + _dealloc_node.assign(_ordered_nodes.size(), node_not_assigned); // Vector for count usages std::vector<int> usages_counts(_ordered_nodes.size(), 0); auto allocate = [this](uint32_t node, uint32_t tensor) { - if (_alloc_node[tensor] != nodeNotAssigned) + if (_alloc_node[tensor] != node_not_assigned) { return; } - assert(_dealloc_node[tensor] == nodeNotAssigned); + assert(_dealloc_node[tensor] == node_not_assigned); _alloc_node[tensor] = node; }; auto deallocate = [this](uint32_t node, uint32_t tensor) { - assert(_dealloc_node[tensor] == nodeNotAssigned); + 
assert(_dealloc_node[tensor] == node_not_assigned); _dealloc_node[tensor] = node; }; @@ -158,13 +135,24 @@ void ExecutionPlanner::get_usage_interval() for (uint32_t i = 0; i < _ordered_nodes.size(); i++) { const auto node = _ordered_nodes.at(i); + auto prev_nodes = preds(node); if (const auto *const_node = dynamic_cast<const luci::CircleConst *>(node)) { allocate(0, i); } - allocate(i, i); + else if (!isExecutableNode(loco::must_cast<luci::CircleNode *>(node))) + { + // If current node is multi output node than begin life time for current node should start + // when prev node start live + auto it = std::find(_ordered_nodes.begin(), _ordered_nodes.end(), *prev_nodes.begin()); + size_t index = std::distance(_ordered_nodes.begin(), it); + allocate(index, i); + } + else + { + allocate(i, i); + } - auto prev_nodes = preds(node); for (auto &prev_node : prev_nodes) { auto it = std::find(_ordered_nodes.begin(), _ordered_nodes.end(), prev_node); @@ -203,7 +191,7 @@ uint32_t ExecutionPlanner::get_offsets_with_greedy_by_size() uint32_t ExecutionPlanner::greedy_by_size_approach() { size_t result_size = 0; - create_alloc_node_inform_vector(false, false, false); + create_alloc_node_inform_vector(_is_null_consts, _is_null_inputs, _is_null_scratchpads); std::vector<AllocationNodeInformation> ordered_alloc_inform; for (auto &current_node : _alloc_node_inform_vector) { @@ -250,22 +238,22 @@ uint32_t ExecutionPlanner::greedy_by_size_approach() } void ExecutionPlanner::create_alloc_node_inform_vector(bool null_consts, bool null_inputs, - bool null_im2col) + bool null_scratchpad) { auto node_compare = [this](const AllocationNodeInformation &alloc_1, const AllocationNodeInformation &alloc_2) { auto idx1 = alloc_1.node_num; auto idx2 = alloc_2.node_num; - if (this->_alloc_node[idx1] == 0 && this->_dealloc_node[idx1] == nodeNotAssigned) + if (this->_alloc_node[idx1] == 0 && this->_dealloc_node[idx1] == node_not_assigned) { - if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == 
nodeNotAssigned) + if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == node_not_assigned) { return idx1 < idx2; } return true; } - if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == nodeNotAssigned) + if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == node_not_assigned) { return false; } @@ -305,30 +293,66 @@ void ExecutionPlanner::create_alloc_node_inform_vector(bool null_consts, bool nu { _alloc_node_inform_vector[i].size = 0; } + else if (!isTensorProducingNode(circle_node)) + { + _alloc_node_inform_vector[i].size = 0; + } else { _alloc_node_inform_vector[i].size = node_size; } - // Im2col - auto opcode = circle_node->opcode(); - if (opcode == luci::CircleOpcode::CONV_2D) + // Scratchpad If needed + std::vector<uint32_t> scratchpad_sizes; + if (!null_scratchpad) { - auto conv = loco::must_cast<const luci::CircleConv2D *>(circle_node); - auto im2col_size = compute_im2col_size(conv); - if (im2col_size > 0) + switch (circle_node->opcode()) { - AllocationNodeInformation temp_alloc; - - if (null_im2col) + case luci::CircleOpcode::AVERAGE_POOL_2D: { - temp_alloc.size = 0; + const auto avg_pool = loco::must_cast<const luci::CircleAveragePool2D *>(circle_node); + scratchpad_sizes.push_back( + _scratchpad_helper->ComputeScratchpadSizeAveragePool2d(avg_pool)); + break; } - else + case luci::CircleOpcode::BATCH_MATMUL: { - temp_alloc.size = im2col_size; + const auto batch_mat_mul = loco::must_cast<const luci::CircleBatchMatMul *>(circle_node); + scratchpad_sizes = _scratchpad_helper->ComputeScratchpadSizeBatchMatMul(batch_mat_mul); + break; } + case luci::CircleOpcode::CONV_2D: + { + const auto conv = loco::must_cast<const luci::CircleConv2D *>(circle_node); + scratchpad_sizes.push_back(_scratchpad_helper->ComputeScratchpadSizeConv2d(conv)); + break; + } + case luci::CircleOpcode::DEPTHWISE_CONV_2D: + { + const auto depthwise_conv = + loco::must_cast<const luci::CircleDepthwiseConv2D *>(circle_node); + scratchpad_sizes.push_back( + 
_scratchpad_helper->ComputeScratchpadSizeDepthwiseConv2d(depthwise_conv)); + break; + } + case luci::CircleOpcode::SVDF: + { + const auto svdf = loco::must_cast<const luci::CircleSVDF *>(circle_node); + scratchpad_sizes = _scratchpad_helper->ComputeScratchpadSizeSVDF(svdf); + break; + } + default: + break; + } + } + + for (const auto scratchpad_size : scratchpad_sizes) + { + if (scratchpad_size > 0) + { + AllocationNodeInformation temp_alloc; + temp_alloc.size = scratchpad_size; temp_alloc.first_node = i - 1; temp_alloc.last_node = i + 1; temp_alloc.node_num = i; @@ -352,7 +376,7 @@ void ExecutionPlanner::dump_inform() { auto current_node_it = std::find_if( _alloc_node_inform_vector.begin(), _alloc_node_inform_vector.end(), - [this, i](const AllocationNodeInformation &x) { return x.node_num == i && !x.is_temp; }); + [i](const AllocationNodeInformation &x) { return x.node_num == i && !x.is_temp; }); for (uint32_t j = 0; j < _ordered_nodes.size(); j++) { auto first_node = _alloc_node[j]; @@ -360,7 +384,7 @@ void ExecutionPlanner::dump_inform() auto it = std::find_if( _alloc_node_inform_vector.begin(), _alloc_node_inform_vector.end(), - [this, j](const AllocationNodeInformation &x) { return x.node_num == j && !x.is_temp; }); + [j](const AllocationNodeInformation &x) { return x.node_num == j && !x.is_temp; }); if (i >= first_node && i <= last_node) { current_node_it->breadth += it->size; @@ -386,4 +410,4 @@ void ExecutionPlanner::dump_inform() }); } -} // namespace luci +} // namespace circle_planner diff --git a/compiler/circle-execution-plan/src/ExecutionPlanner.h b/compiler/circle-execution-plan/src/ExecutionPlanner.h index 8e3d9b46a..e0833c407 100644 --- a/compiler/circle-execution-plan/src/ExecutionPlanner.h +++ b/compiler/circle-execution-plan/src/ExecutionPlanner.h @@ -17,10 +17,15 @@ #ifndef CIRCLE_EXECUTION_PLANNER_H #define CIRCLE_EXECUTION_PLANNER_H +#include "TargetPlatform.h" +#include "IScratchpadHelper.h" +#include "ScratchpadHelperLinux.h" +#include 
"ScratchpadHelperMCU.h" +#include "ScratchpadHelperCMSISNN.h" #include <luci/IR/Module.h> #include <luci/Plan/CircleNodeExecutionPlan.h> -namespace luci +namespace circle_planner { // struct for additional information for the node. it helps build allocations plan for nodes. struct AllocationNodeInformation @@ -50,7 +55,7 @@ struct AllocationNodeInformation uint32_t last_node; // is the current node temporary or not bool is_temp; - // operation breadth of current node + // Breadth is a sum of live tensors sizes at the moment of execution of given node uint32_t breadth; bool operator<(const AllocationNodeInformation &other) const { return offset < other.offset; } @@ -60,12 +65,44 @@ class ExecutionPlanner { public: ExecutionPlanner() = delete; - explicit ExecutionPlanner(loco::Graph *graph) { _graph = graph; }; + explicit ExecutionPlanner(loco::Graph *graph) : _graph(graph) + { + _scratchpad_helper = std::make_unique<ScratchpadHelperLinux>(); + } + + explicit ExecutionPlanner(loco::Graph *graph, TargetPlatform target_platform) : _graph(graph) + { + switch (target_platform.platform_type) + { + case LINUX: + _scratchpad_helper = std::make_unique<ScratchpadHelperLinux>(); + break; + case MCU: + _scratchpad_helper = std::make_unique<ScratchpadHelperMCU>(); + break; + case CMSISNN: + _scratchpad_helper = std::make_unique<ScratchpadHelperCMSISNN>(target_platform.use_dsp); + break; + default: + assert(false && "Use unsupported platform"); + } + }; // Method provides execution plan, which contains execution order and // memory offsets for all nodes in _graph. // This plan writes in nodes annotation information with help of CircleNodeExecutionPlan class. 
- void get_execution_plan(); + void make_execution_plan(); + + // Method change planning mode: + // is_null_consts = true - constants are no longer taken into account when planning + // is_null_inputs = true - input are no longer taken into account when planning + // is_null_scratchpads = true - scratchpads are no longer taken into account when planning + void change_planning_mode(bool is_null_consts, bool is_null_inputs, bool is_null_scratchpads) + { + _is_null_consts = is_null_consts; + _is_null_inputs = is_null_inputs; + _is_null_scratchpads = is_null_scratchpads; + }; private: // Method gets default execution order plan and saves it in _ordered_nodes vector. @@ -83,18 +120,19 @@ private: // Return: required size of buffer. uint32_t get_offsets_with_greedy_by_size(); - // Realization of greedy by size approach to find offsets for nodes. + // Realization of greedy by size approach (algorithm is mentioned in + // "EFFICIENT MEMORY MANAGEMENT FOR DEEP NEURAL NET INFERENCE" paper) to find offsets for nodes. uint32_t greedy_by_size_approach(); // Method creates and fills _alloc_node_inform_vector with usage interval inform and node's sizes. // null_consts = true - size of const nodes will be equal 0; // null_inputs = true - size of input nodes will be equal 0; - // null_im2col = true - size of im2col nodes will be equal 0; - // It using if we don't want to take input(const or im2col) nodes into account + // null_scratchpad = true - size of scratchpad nodes will be equal 0; + // It using if we don't want to take input(const or scratchpads) nodes into account // when determining offsets and calculating the required buffer size. This is uses for // experiments. void create_alloc_node_inform_vector(bool null_consts = false, bool null_inputs = false, - bool null_im2col = false); + bool null_scratchpad = false); // Stores allocation additional information for the all nodes from _graph. 
std::vector<AllocationNodeInformation> _alloc_node_inform_vector; @@ -121,10 +159,21 @@ private: loco::Graph *_graph; + // Calculate size of scratchpad tensors for current platform + std::unique_ptr<IScratchpadHelper> _scratchpad_helper; + // Required memory size. uint32_t _required_size = 0; + + // Flags for choosing different planning modes: + // _is_null_consts = true - constants are no longer taken into account when planning + // _is_null_inputs = true - input are no longer taken into account when planning + // _is_null_scratchpads = true - scratchpads are no longer taken into account when planning + bool _is_null_consts = false; + bool _is_null_inputs = false; + bool _is_null_scratchpads = false; }; -} // namespace luci +} // namespace circle_planner #endif // CIRCLE_EXECUTION_PLANNER_H |