Imported Upstream version 1.20.0upstream/1.20.0 submit/tizen/20220415.103159

author: Chunseok Lee <chunseok.lee@samsung.com> 2022-04-15 19:15:11 +0900
committer: Chunseok Lee <chunseok.lee@samsung.com> 2022-04-15 19:15:11 +0900
commit: 3ad689f0803519e343c36d5700646e86059df961 (patch)
tree: 862346c401a5577518fa7f042532aa931b53aa0e /compiler/circle-execution-plan
parent: ac6e4dd7b480e83b586ef533d7b29a8a97eb48fe (diff)
download: nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.gz
nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.bz2
nnfw-3ad689f0803519e343c36d5700646e86059df961.zip
10 files changed, 709 insertions, 91 deletions
diff --git a/compiler/circle-execution-plan/CMakeLists.txt b/compiler/circle-execution-plan/CMakeLists.txt
index 115d24860..2f657c171 100644
--- a/compiler/circle-execution-plan/CMakeLists.txt
+++ b/compiler/circle-execution-plan/CMakeLists.txt
@@ -1,4 +1,9 @@
 set(SOURCES
+        pal/IScratchpadHelper.h
+        pal/ScratchpadHelperLinux.h
+        pal/ScratchpadHelperMCU.h
+        pal/ScratchpadHelperCMSISNN.h
+        pal/TargetPlatform.h
         src/CircleExecutionPlan.cpp
         src/ExecutionPlanner.cpp
         src/ExecutionPlanner.h
@@ -13,4 +18,5 @@ target_link_libraries(circle_execution_plan luci_export)
 target_link_libraries(circle_execution_plan luci_plan)
 target_link_libraries(circle_execution_plan arser)
 
+target_include_directories(circle_execution_plan PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/pal")
 install(TARGETS circle_execution_plan DESTINATION bin)
diff --git a/compiler/circle-execution-plan/README.md b/compiler/circle-execution-plan/README.md
index e789a55db..dbb7d4f85 100644
--- a/compiler/circle-execution-plan/README.md
+++ b/compiler/circle-execution-plan/README.md
@@ -10,13 +10,12 @@ The output circle file contains plan (`CircleNodeMemoryPlan`) information for ev
 - number which determines order in which nodes will be executed
 - memory offsets for node output tensors from the beginning of shared memory buffer
 
-In order to record and read this metadata, we use `CircleImportMetadata` and `CircleExportMetadata`.
-For this purpose we use `std::map<uint32_t, std::vector<uint32_t>> _memory_plan_table` which for each node with key ID contains encoded `CircleNodeMemoryPlan` data.
+In order to record and read this data, we use `luci::CircleNodeExecutionPlan`.
 
 ### Execution plan building
 
 In order to build "execution plan" we use `ExecutionPlanner` class.
-The main method is `get_execution_plan()` which for each node finds and writes to its annotations 
+The main method is `make_execution_plan()` which for each node finds and writes to its annotations 
 "execution plan". For this purpose there are two steps:
 - determining the order of execution of nodes, which is stored in `_ordered_nodes` vector.
 Now for this purpose there is only one default method `get_default_execution_order_plan()` that uses `loco::postorder_traversal(const std::vector<loco::Node *> &roots)`.
diff --git a/compiler/circle-execution-plan/pal/IScratchpadHelper.h b/compiler/circle-execution-plan/pal/IScratchpadHelper.h
new file mode 100644
index 000000000..f5a991526
--- /dev/null
+++ b/compiler/circle-execution-plan/pal/IScratchpadHelper.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CIRCLE_EXECUTION_PLAN_ISCRATCHPAD_HELPER_H
+#define CIRCLE_EXECUTION_PLAN_ISCRATCHPAD_HELPER_H
+
+#include <luci/IR/Nodes/CircleAveragePool2D.h>
+#include <luci/IR/Nodes/CircleBatchMatMul.h>
+#include <luci/IR/Nodes/CircleConv2D.h>
+#include <luci/IR/Nodes/CircleDepthwiseConv2D.h>
+#include <luci/IR/Nodes/CircleSVDF.h>
+#include <cstdint>
+#include <vector>
+
+namespace circle_planner
+{
+
+/**
+ * @brief Interface for computing sizes of scratchpad (temporary) tensors required by a node
+ *
+ * Each target platform provides its own implementation. All sizes are in bytes; methods
+ * returning a vector yield one size per scratchpad tensor.
+ */
+class IScratchpadHelper
+{
+public:
+  virtual uint32_t
+  ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) = 0;
+
+  virtual std::vector<uint32_t>
+  ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) = 0;
+
+  virtual uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *conv) = 0;
+
+  virtual uint32_t
+  ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) = 0;
+
+  virtual std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) = 0;
+
+  virtual ~IScratchpadHelper() = default;
+};
+
+} // namespace circle_planner
+
+#endif // CIRCLE_EXECUTION_PLAN_ISCRATCHPAD_HELPER_H
diff --git a/compiler/circle-execution-plan/pal/ScratchpadHelperCMSISNN.h b/compiler/circle-execution-plan/pal/ScratchpadHelperCMSISNN.h
new file mode 100644
index 000000000..5369c0937
--- /dev/null
+++ b/compiler/circle-execution-plan/pal/ScratchpadHelperCMSISNN.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_CMSISNN_H
+#define CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_CMSISNN_H
+
+#include "IScratchpadHelper.h"
+#include <cassert>
+
+namespace circle_planner
+{
+
+namespace
+{
+
+inline int32_t computePadding(int32_t stride, int32_t dilation_rate, int32_t in_size,
+                              int32_t filter_size, int32_t out_size)
+{
+  const int32_t effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  const int32_t padding = ((out_size - 1) * stride + effective_filter_size - in_size) / 2;
+  return padding > 0 ? padding : 0;
+}
+
+} // namespace
+
+class ScratchpadHelperCMSISNN : public IScratchpadHelper
+{
+public:
+  explicit ScratchpadHelperCMSISNN(bool use_dsp) : _use_dsp(use_dsp)
+  {
+    // Do nothing
+  }
+
+  uint32_t ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) final
+  {
+    // Main logic of arm_avgpool_s8_get_buffer_size
+
+    const auto avg_pool_input = loco::must_cast<luci::CircleNode *>(avg_pool->value());
+
+    if (avg_pool_input->dtype() != loco::DataType::S8 or !_use_dsp)
+      return 0;
+
+    const auto depth = static_cast<int32_t>(avg_pool_input->dim(3).value());
+
+    return depth * sizeof(int32_t);
+  }
+
+  std::vector<uint32_t>
+  ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) final
+  {
+    throw std::runtime_error("BatchMatMul is not currently supported for cmsisnn platform");
+  }
+
+  uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *conv) final
+  {
+    // Main logic of arm_convolve_wrapper_s8_get_buffer_size
+
+    const auto dilation_height_factor = static_cast<int32_t>(conv->dilation()->h());
+    const auto dilation_width_factor = static_cast<int32_t>(conv->dilation()->w());
+
+    const auto conv_input = loco::must_cast<luci::CircleNode *>(conv->input());
+    const auto filter = loco::must_cast<luci::CircleNode *>(conv->filter());
+
+    if (dilation_width_factor != 1 or dilation_height_factor != 1 or
+        conv_input->dtype() != loco::DataType::S8)
+    {
+      return 0;
+    }
+
+    const auto input_depth = static_cast<int32_t>(conv_input->dim(3).value());
+
+    const auto input_height = static_cast<int32_t>(conv_input->dim(1).value());
+    const auto input_width = static_cast<int32_t>(conv_input->dim(2).value());
+
+    const auto filter_height = static_cast<int32_t>(filter->dim(1).value());
+    const auto filter_width = static_cast<int32_t>(filter->dim(2).value());
+
+    const auto stride_height = static_cast<int32_t>(conv->stride()->h());
+    const auto stride_width = static_cast<int32_t>(conv->stride()->w());
+
+    const auto output_height = static_cast<int32_t>(conv->dim(1).value());
+    const auto output_width = static_cast<int32_t>(conv->dim(2).value());
+
+    assert(conv_input->quantparam()->zerop.size() == 1);
+    assert(conv->quantparam()->zerop.size() == 1);
+
+    const auto padding_height = computePadding(stride_height, dilation_height_factor, input_height,
+                                               filter_height, output_height);
+    const auto padding_width =
+      computePadding(stride_width, dilation_width_factor, input_width, filter_width, output_width);
+
+    if ((padding_width == 0) && (padding_height == 0) && (input_depth % 4 == 0) &&
+        (stride_width == 1) && (stride_height == 1) && (filter_width == 1) && (filter_height == 1))
+    {
+      return 0;
+    }
+
+    if (_use_dsp)
+    {
+      return (2 * input_depth * filter_width * filter_height) * sizeof(int16_t);
+    }
+
+    return 0;
+  }
+
+  uint32_t
+  ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) final
+  {
+    // Main logic of arm_depthwise_conv_wrapper_s8_get_buffer_size
+
+    const auto dilation_height_factor = static_cast<int32_t>(depthwise_conv->dilation()->h());
+    const auto dilation_width_factor = static_cast<int32_t>(depthwise_conv->dilation()->w());
+
+    const auto depthwise_conv_input = loco::must_cast<luci::CircleNode *>(depthwise_conv->input());
+    const auto filter = loco::must_cast<luci::CircleNode *>(depthwise_conv->filter());
+
+    if (dilation_width_factor != 1 or dilation_height_factor != 1 or
+        depthwise_conv_input->dtype() != loco::DataType::S8)
+    {
+      return 0;
+    }
+
+    const auto input_depth = static_cast<int32_t>(depthwise_conv_input->dim(3).value());
+    const auto output_depth = static_cast<int32_t>(depthwise_conv->dim(3).value());
+    const auto batch_size = static_cast<int32_t>(depthwise_conv_input->dim(0).value());
+
+    if (input_depth != output_depth or batch_size != 1 or !_use_dsp)
+      return 0;
+
+    const auto filter_height = static_cast<int32_t>(filter->dim(1).value());
+    const auto filter_width = static_cast<int32_t>(filter->dim(2).value());
+
+    return input_depth * filter_height * filter_width * sizeof(int16_t);
+  }
+
+  std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) final
+  {
+    // Returns the size in bytes of every scratchpad tensor SVDF needs on the cmsisnn platform
+    const auto svdf_input = loco::must_cast<luci::CircleNode *>(svdf->input());
+    const auto weight_feature_input = loco::must_cast<luci::CircleNode *>(svdf->weight_feature());
+    // Hybrid models (FLOAT32 input with S8/U8 feature weights) are rejected
+    if (svdf_input->dtype() == loco::DataType::FLOAT32 and
+        (weight_feature_input->dtype() == loco::DataType::S8 or
+         weight_feature_input->dtype() == loco::DataType::U8))
+    {
+      throw std::runtime_error("Hybrid type is not currently supported for cmsisnn platform");
+    }
+
+    std::vector<uint32_t> scratchpad_sizes;
+    const auto batch_size = svdf_input->dim(0).value();
+    const auto num_filters = weight_feature_input->dim(0).value();
+    const auto rank = svdf->svdf_rank();
+    const auto num_units = num_filters / rank;
+    // S8 input needs two int32 buffers; any other input type needs one float buffer
+    if (svdf_input->dtype() == loco::DataType::S8)
+    {
+      scratchpad_sizes.push_back(batch_size * num_filters * sizeof(int32_t));
+      scratchpad_sizes.push_back(batch_size * num_units * sizeof(int32_t));
+    }
+    else
+    {
+      scratchpad_sizes.push_back(batch_size * num_filters * sizeof(float));
+    }
+
+    return scratchpad_sizes;
+  }
+
+private:
+  bool _use_dsp;
+};
+
+} // namespace circle_planner
+
+#endif // CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_CMSISNN_H
diff --git a/compiler/circle-execution-plan/pal/ScratchpadHelperLinux.h b/compiler/circle-execution-plan/pal/ScratchpadHelperLinux.h
new file mode 100644
index 000000000..811aa67c3
--- /dev/null
+++ b/compiler/circle-execution-plan/pal/ScratchpadHelperLinux.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_LINUX_H
+#define CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_LINUX_H
+
+#include "IScratchpadHelper.h"
+#include <loco/IR/DataTypeTraits.h>
+
+namespace circle_planner
+{
+
+class ScratchpadHelperLinux : public IScratchpadHelper
+{
+public:
+  uint32_t ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) final
+  {
+    // for linux AveragePool2d scratchpad tensors size = 0
+    return 0;
+  }
+
+  std::vector<uint32_t>
+  ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) final
+  {
+    // One scratchpad per operand, sized in bytes as element count * element size of the operand
+    const auto lhs = loco::must_cast<luci::CircleNode *>(batch_mat_mul->x());
+    const auto rhs = loco::must_cast<luci::CircleNode *>(batch_mat_mul->y());
+
+    std::vector<uint32_t> scratchpad_sizes;
+    // Scratchpad for lhs (unsigned index matches the unsigned rank, no sign-compare mismatch)
+    uint32_t scratchpad_size = 1;
+    for (uint32_t i = 0; i < lhs->rank(); ++i)
+      scratchpad_size *= lhs->dim(i).value();
+
+    scratchpad_sizes.push_back(scratchpad_size * loco::size(lhs->dtype()));
+
+    // Scratchpad for rhs
+    scratchpad_size = 1;
+    for (uint32_t i = 0; i < rhs->rank(); ++i)
+      scratchpad_size *= rhs->dim(i).value();
+
+    scratchpad_sizes.push_back(scratchpad_size * loco::size(rhs->dtype()));
+
+    return scratchpad_sizes;
+  }
+
+  uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *conv) final
+  {
+    const auto conv_input = loco::must_cast<luci::CircleNode *>(conv->input());
+    const auto filter = loco::must_cast<luci::CircleNode *>(conv->filter());
+
+    const uint32_t stride_height = conv->stride()->h();
+    const uint32_t stride_width = conv->stride()->w();
+
+    const uint32_t dilation_height_factor = conv->dilation()->h();
+    const uint32_t dilation_width_factor = conv->dilation()->w();
+
+    const uint32_t filter_height = filter->dim(1).value();
+    const uint32_t filter_width = filter->dim(2).value();
+
+    const bool need_dilated_im2col = dilation_height_factor != 1 || dilation_width_factor != 1;
+    const bool need_non_dilated_im2col =
+      stride_height != 1 || stride_width != 1 || filter_height != 1 || filter_width != 1;
+    const bool need_im2col = conv_input->dtype() != loco::DataType::S16 &&
+                             (need_dilated_im2col || need_non_dilated_im2col);
+
+    if (!need_im2col)
+    {
+      return 0;
+    }
+
+    const uint32_t input_depth = conv_input->dim(3).value();
+    const uint32_t batches = conv_input->dim(0).value();
+
+    const uint32_t output_height = conv->dim(1).value();
+    const uint32_t output_width = conv->dim(2).value();
+
+    return batches * output_height * output_width * input_depth * filter_height * filter_width *
+           size(conv_input->dtype());
+  }
+
+  uint32_t
+  ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) final
+  {
+    // for linux DepthwiseConv2d scratchpad tensors size = 0
+    return 0;
+  }
+
+  std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) final
+  {
+    const auto svdf_input = loco::must_cast<luci::CircleNode *>(svdf->input());
+    const auto weight_feature_input = loco::must_cast<luci::CircleNode *>(svdf->weight_feature());
+
+    if (svdf_input->dtype() == loco::DataType::FLOAT32 and
+        (weight_feature_input->dtype() == loco::DataType::S8 or
+         weight_feature_input->dtype() == loco::DataType::U8))
+    {
+      throw std::runtime_error("Hybrid type is not currently supported for linux platform");
+    }
+
+    std::vector<uint32_t> scratchpad_sizes;
+
+    const auto batch_size = svdf_input->dim(0).value();
+    const auto num_filters = weight_feature_input->dim(0).value();
+    const auto rank = svdf->svdf_rank();
+    const auto num_units = num_filters / rank;
+
+    if (svdf_input->dtype() == loco::DataType::S8)
+    {
+      scratchpad_sizes.push_back(batch_size * num_filters * sizeof(int32_t));
+      scratchpad_sizes.push_back(batch_size * num_units * sizeof(int32_t));
+    }
+    else
+    {
+      scratchpad_sizes.push_back(batch_size * num_filters * sizeof(float));
+    }
+
+    return scratchpad_sizes;
+  }
+};
+
+} // namespace circle_planner
+
+#endif // CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_LINUX_H
diff --git a/compiler/circle-execution-plan/pal/ScratchpadHelperMCU.h b/compiler/circle-execution-plan/pal/ScratchpadHelperMCU.h
new file mode 100644
index 000000000..14b41640c
--- /dev/null
+++ b/compiler/circle-execution-plan/pal/ScratchpadHelperMCU.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_MCU_H
+#define CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_MCU_H
+
+#include "IScratchpadHelper.h"
+
+namespace circle_planner
+{
+
+class ScratchpadHelperMCU : public IScratchpadHelper
+{
+public:
+  uint32_t ComputeScratchpadSizeAveragePool2d(const luci::CircleAveragePool2D *avg_pool) final
+  {
+    // for mcu AveragePool2d scratchpad tensors size = 0
+    return 0;
+  }
+
+  std::vector<uint32_t>
+  ComputeScratchpadSizeBatchMatMul(const luci::CircleBatchMatMul *batch_mat_mul) final
+  {
+    throw std::runtime_error("BatchMatMul is not currently supported for mcu platform");
+  }
+
+  uint32_t ComputeScratchpadSizeConv2d(const luci::CircleConv2D *) final
+  {
+    // for mcu scratchpad size = 0
+    return 0;
+  }
+
+  uint32_t
+  ComputeScratchpadSizeDepthwiseConv2d(const luci::CircleDepthwiseConv2D *depthwise_conv) final
+  {
+    // for mcu DepthwiseConv2d scratchpad tensors size = 0
+    return 0;
+  }
+
+  std::vector<uint32_t> ComputeScratchpadSizeSVDF(const luci::CircleSVDF *svdf) final
+  {
+    // Returns the size in bytes of every scratchpad tensor SVDF needs on the mcu platform
+    const auto svdf_input = loco::must_cast<luci::CircleNode *>(svdf->input());
+    const auto weight_feature_input = loco::must_cast<luci::CircleNode *>(svdf->weight_feature());
+    // Hybrid models (FLOAT32 input with S8/U8 feature weights) are rejected
+    if (svdf_input->dtype() == loco::DataType::FLOAT32 and
+        (weight_feature_input->dtype() == loco::DataType::S8 or
+         weight_feature_input->dtype() == loco::DataType::U8))
+    {
+      throw std::runtime_error("Hybrid type is not currently supported for mcu platform");
+    }
+
+    std::vector<uint32_t> scratchpad_sizes;
+    const auto batch_size = svdf_input->dim(0).value();
+    const auto num_filters = weight_feature_input->dim(0).value();
+    const auto rank = svdf->svdf_rank();
+    const auto num_units = num_filters / rank;
+    // S8 input needs two int32 buffers; any other input type needs one float buffer
+    if (svdf_input->dtype() == loco::DataType::S8)
+    {
+      scratchpad_sizes.push_back(batch_size * num_filters * sizeof(int32_t));
+      scratchpad_sizes.push_back(batch_size * num_units * sizeof(int32_t));
+    }
+    else
+    {
+      scratchpad_sizes.push_back(batch_size * num_filters * sizeof(float));
+    }
+
+    return scratchpad_sizes;
+  }
+};
+
+} // namespace circle_planner
+
+#endif // CIRCLE_EXECUTION_PLAN_SCRATCHPAD_HELPER_MCU_H
diff --git a/compiler/circle-execution-plan/pal/TargetPlatform.h b/compiler/circle-execution-plan/pal/TargetPlatform.h
new file mode 100644
index 000000000..538a502fe
--- /dev/null
+++ b/compiler/circle-execution-plan/pal/TargetPlatform.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CIRCLE_EXECUTION_PLAN_TARGET_PLATFORM_H
+#define CIRCLE_EXECUTION_PLAN_TARGET_PLATFORM_H
+
+namespace circle_planner
+{
+
+enum SupportedPlatformType
+{
+  LINUX,
+  MCU,
+  CMSISNN
+};
+
+struct TargetPlatform
+{
+  SupportedPlatformType platform_type;
+  bool use_dsp;
+};
+
+} // namespace circle_planner
+
+#endif // CIRCLE_EXECUTION_PLAN_TARGET_PLATFORM_H
diff --git a/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp b/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp
index a54100b8c..1788124c3 100644
--- a/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp
+++ b/compiler/circle-execution-plan/src/CircleExecutionPlan.cpp
@@ -35,6 +35,18 @@ int entry(int argc, char **argv)
 
   arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
   arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
+  arser.add_argument("--platform")
+    .nargs(1)
+    .type(arser::DataType::STR)
+    .required(false)
+    .default_value("linux")
+    .help("Platform name: linux mcu cmsisnn");
+  arser.add_argument("--use_dsp")
+    .nargs(1)
+    .type(arser::DataType::BOOL)
+    .required(false)
+    .default_value(false)
+    .help("Plan with or without dsp (now can be used only with cmsisnn)");
 
   try
   {
@@ -47,8 +59,35 @@ int entry(int argc, char **argv)
     return 255;
   }
 
-  std::string input_path = arser.get<std::string>("input");
-  std::string output_path = arser.get<std::string>("output");
+  const std::string input_path = arser.get<std::string>("input");
+  const std::string output_path = arser.get<std::string>("output");
+  const std::string platform_name = arser.get<std::string>("--platform");
+  const bool use_dsp = arser.get<bool>("--use_dsp");
+
+  if (platform_name != "cmsisnn" && use_dsp)
+  {
+    std::cerr << "ERROR: Now use_dsp can be used only with cmsisnn" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  circle_planner::SupportedPlatformType platform_type;
+  if (platform_name == "linux")
+  {
+    platform_type = circle_planner::SupportedPlatformType::LINUX;
+  }
+  else if (platform_name == "mcu")
+  {
+    platform_type = circle_planner::SupportedPlatformType::MCU;
+  }
+  else if (platform_name == "cmsisnn")
+  {
+    platform_type = circle_planner::SupportedPlatformType::CMSISNN;
+  }
+  else
+  {
+    std::cerr << "ERROR: Invalid platform name '" << platform_name << "'" << std::endl;
+    return EXIT_FAILURE;
+  }
 
   foder::FileLoader file_loader{input_path};
   std::vector<char> model_data;
@@ -82,8 +121,8 @@ int entry(int argc, char **argv)
   auto module = importer.importModule(circle_model);
 
   // Do main job
-  luci::ExecutionPlanner execution_planner(module->graph());
-  execution_planner.get_execution_plan();
+  circle_planner::ExecutionPlanner execution_planner(module->graph(), {platform_type, use_dsp});
+  execution_planner.make_execution_plan();
 
   // Export to output Circle file
   luci::CircleExporter exporter;
diff --git a/compiler/circle-execution-plan/src/ExecutionPlanner.cpp b/compiler/circle-execution-plan/src/ExecutionPlanner.cpp
index c37d1e5f5..ec2ec1362 100644
--- a/compiler/circle-execution-plan/src/ExecutionPlanner.cpp
+++ b/compiler/circle-execution-plan/src/ExecutionPlanner.cpp
@@ -18,72 +18,49 @@
 #include <loco/IR/Algorithm.h>
 #include <luci/UserSettings.h>
 
-namespace luci
+namespace circle_planner
 {
 namespace
 {
 
-constexpr uint32_t nodeNotAssigned = std::numeric_limits<int32_t>::max();
+constexpr uint32_t node_not_assigned = std::numeric_limits<int32_t>::max();
 
-uint32_t compute_output_size(Padding padding, uint32_t image_size, uint32_t filter_size,
-                             uint32_t stride, uint32_t dilation_rate = 1)
+bool isExecutableNode(const luci::CircleNode *node)
 {
-  const int32_t effective_filter_size = (filter_size - 1) * dilation_rate + 1;
-  switch (padding)
+  switch (node->opcode())
   {
-    case Padding::SAME:
-      return (image_size + stride - 1) / stride;
-    case Padding::VALID:
-      return (image_size + stride - effective_filter_size) / stride;
+    // The following nodes denote outputs of multiple-output nodes.
+    // The list is synchronized with the same list from luci-interpreter/src/loader/GraphLoader.cpp
+    case luci::CircleOpcode::CIRCLEIFOUT:
+    case luci::CircleOpcode::CIRCLESPLITOUT:
+    case luci::CircleOpcode::CIRCLESPLITVOUT:
+    case luci::CircleOpcode::CIRCLEUNPACKOUT:
+    case luci::CircleOpcode::CIRCLEWHILEOUT:
+      return false;
     default:
-      assert(false);
+      return true;
   }
 }
 
-// Method finds (if necessary) size for im2col temporary tensor.
-uint32_t compute_im2col_size(const luci::CircleConv2D *conv)
+bool isTensorProducingNode(const luci::CircleNode *node)
 {
-  auto conv_input = loco::must_cast<luci::CircleNode *>(conv->input());
-  auto filter = loco::must_cast<luci::CircleNode *>(conv->filter());
-  auto padding = (conv->padding());
-  uint32_t stride_height = conv->stride()->h();
-  uint32_t stride_width = conv->stride()->w();
-
-  uint32_t dilation_height_factor = conv->dilation()->h();
-  uint32_t dilation_width_factor = conv->dilation()->w();
-
-  uint32_t filter_height = filter->dim(1).value();
-  uint32_t filter_width = filter->dim(2).value();
-
-  const bool need_dilated_im2col = dilation_height_factor != 1 || dilation_width_factor != 1;
-  const bool need_non_dilated_im2col =
-    stride_height != 1 || stride_width != 1 || filter_height != 1 || filter_width != 1;
-  bool need_im2col =
-    conv_input->dtype() != loco::DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
-
-  if (!need_im2col)
+  switch (node->opcode())
   {
-    return 0;
+    // The following nodes are multiple-output nodes. They do not produce tensors, the tensors
+    // are produced by the corresponding *Out nodes instead.
+    // The list is synchronized with the same list from luci-interpreter/src/loader/GraphLoader.cpp
+    case luci::CircleOpcode::IF:
+    case luci::CircleOpcode::SPLIT:
+    case luci::CircleOpcode::UNPACK:
+      return false;
+    default:
+      return true;
   }
-
-  uint32_t input_depth = conv_input->dim(3).value();
-  uint32_t input_height = conv_input->dim(1).value();
-  uint32_t input_width = conv_input->dim(2).value();
-
-  uint32_t output_height = compute_output_size(padding, input_height, filter_height, stride_height,
-                                               dilation_height_factor);
-  uint32_t output_width =
-    compute_output_size(padding, input_width, filter_width, stride_width, dilation_width_factor);
-
-  uint32_t batches = conv_input->dim(0).value();
-
-  return batches * output_height * output_width * input_depth * filter_height * filter_width *
-         size(conv_input->dtype());
 }
 
 } // namespace
 
-void ExecutionPlanner::get_execution_plan()
+void ExecutionPlanner::make_execution_plan()
 {
   get_default_execution_order_plan();
   _required_size = get_offsets_with_greedy_by_size();
@@ -106,23 +83,23 @@ void ExecutionPlanner::get_default_execution_order_plan()
 void ExecutionPlanner::get_usage_interval()
 {
   // Initialize vectors of first and last nodes for usage interval
-  _alloc_node.assign(_ordered_nodes.size(), nodeNotAssigned);
-  _dealloc_node.assign(_ordered_nodes.size(), nodeNotAssigned);
+  _alloc_node.assign(_ordered_nodes.size(), node_not_assigned);
+  _dealloc_node.assign(_ordered_nodes.size(), node_not_assigned);
 
   // Vector for count usages
   std::vector<int> usages_counts(_ordered_nodes.size(), 0);
 
   auto allocate = [this](uint32_t node, uint32_t tensor) {
-    if (_alloc_node[tensor] != nodeNotAssigned)
+    if (_alloc_node[tensor] != node_not_assigned)
     {
       return;
     }
-    assert(_dealloc_node[tensor] == nodeNotAssigned);
+    assert(_dealloc_node[tensor] == node_not_assigned);
     _alloc_node[tensor] = node;
   };
 
   auto deallocate = [this](uint32_t node, uint32_t tensor) {
-    assert(_dealloc_node[tensor] == nodeNotAssigned);
+    assert(_dealloc_node[tensor] == node_not_assigned);
     _dealloc_node[tensor] = node;
   };
 
@@ -158,13 +135,24 @@ void ExecutionPlanner::get_usage_interval()
   for (uint32_t i = 0; i < _ordered_nodes.size(); i++)
   {
     const auto node = _ordered_nodes.at(i);
+    auto prev_nodes = preds(node);
     if (const auto *const_node = dynamic_cast<const luci::CircleConst *>(node))
     {
       allocate(0, i);
     }
-    allocate(i, i);
+    else if (!isExecutableNode(loco::must_cast<luci::CircleNode *>(node)))
+    {
+      // If the current node is an output of a multi-output node, then its lifetime should start
+      // when the lifetime of its previous (producer) node starts
+      auto it = std::find(_ordered_nodes.begin(), _ordered_nodes.end(), *prev_nodes.begin());
+      size_t index = std::distance(_ordered_nodes.begin(), it);
+      allocate(index, i);
+    }
+    else
+    {
+      allocate(i, i);
+    }
 
-    auto prev_nodes = preds(node);
     for (auto &prev_node : prev_nodes)
     {
       auto it = std::find(_ordered_nodes.begin(), _ordered_nodes.end(), prev_node);
@@ -203,7 +191,7 @@ uint32_t ExecutionPlanner::get_offsets_with_greedy_by_size()
 uint32_t ExecutionPlanner::greedy_by_size_approach()
 {
   size_t result_size = 0;
-  create_alloc_node_inform_vector(false, false, false);
+  create_alloc_node_inform_vector(_is_null_consts, _is_null_inputs, _is_null_scratchpads);
   std::vector<AllocationNodeInformation> ordered_alloc_inform;
   for (auto &current_node : _alloc_node_inform_vector)
   {
@@ -250,22 +238,22 @@ uint32_t ExecutionPlanner::greedy_by_size_approach()
 }
 
 void ExecutionPlanner::create_alloc_node_inform_vector(bool null_consts, bool null_inputs,
-                                                       bool null_im2col)
+                                                       bool null_scratchpad)
 {
   auto node_compare = [this](const AllocationNodeInformation &alloc_1,
                              const AllocationNodeInformation &alloc_2) {
     auto idx1 = alloc_1.node_num;
     auto idx2 = alloc_2.node_num;
 
-    if (this->_alloc_node[idx1] == 0 && this->_dealloc_node[idx1] == nodeNotAssigned)
+    if (this->_alloc_node[idx1] == 0 && this->_dealloc_node[idx1] == node_not_assigned)
     {
-      if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == nodeNotAssigned)
+      if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == node_not_assigned)
       {
         return idx1 < idx2;
       }
       return true;
     }
-    if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == nodeNotAssigned)
+    if (this->_alloc_node[idx2] == 0 && this->_dealloc_node[idx2] == node_not_assigned)
     {
       return false;
     }
@@ -305,30 +293,66 @@ void ExecutionPlanner::create_alloc_node_inform_vector(bool null_consts, bool nu
     {
       _alloc_node_inform_vector[i].size = 0;
     }
+    else if (!isTensorProducingNode(circle_node))
+    {
+      _alloc_node_inform_vector[i].size = 0;
+    }
     else
     {
       _alloc_node_inform_vector[i].size = node_size;
     }
 
-    // Im2col
-    auto opcode = circle_node->opcode();
-    if (opcode == luci::CircleOpcode::CONV_2D)
+    // Scratchpad sizes, if needed
+    std::vector<uint32_t> scratchpad_sizes;
+    if (!null_scratchpad)
     {
-      auto conv = loco::must_cast<const luci::CircleConv2D *>(circle_node);
-      auto im2col_size = compute_im2col_size(conv);
-      if (im2col_size > 0)
+      switch (circle_node->opcode())
       {
-        AllocationNodeInformation temp_alloc;
-
-        if (null_im2col)
+        case luci::CircleOpcode::AVERAGE_POOL_2D:
         {
-          temp_alloc.size = 0;
+          const auto avg_pool = loco::must_cast<const luci::CircleAveragePool2D *>(circle_node);
+          scratchpad_sizes.push_back(
+            _scratchpad_helper->ComputeScratchpadSizeAveragePool2d(avg_pool));
+          break;
         }
-        else
+        case luci::CircleOpcode::BATCH_MATMUL:
         {
-          temp_alloc.size = im2col_size;
+          const auto batch_mat_mul = loco::must_cast<const luci::CircleBatchMatMul *>(circle_node);
+          scratchpad_sizes = _scratchpad_helper->ComputeScratchpadSizeBatchMatMul(batch_mat_mul);
+          break;
         }
+        case luci::CircleOpcode::CONV_2D:
+        {
+          const auto conv = loco::must_cast<const luci::CircleConv2D *>(circle_node);
+          scratchpad_sizes.push_back(_scratchpad_helper->ComputeScratchpadSizeConv2d(conv));
+          break;
+        }
+        case luci::CircleOpcode::DEPTHWISE_CONV_2D:
+        {
+          const auto depthwise_conv =
+            loco::must_cast<const luci::CircleDepthwiseConv2D *>(circle_node);
+          scratchpad_sizes.push_back(
+            _scratchpad_helper->ComputeScratchpadSizeDepthwiseConv2d(depthwise_conv));
+          break;
+        }
+        case luci::CircleOpcode::SVDF:
+        {
+          const auto svdf = loco::must_cast<const luci::CircleSVDF *>(circle_node);
+          scratchpad_sizes = _scratchpad_helper->ComputeScratchpadSizeSVDF(svdf);
+          break;
+        }
+        default:
+          break;
+      }
+    }
+
+    for (const auto scratchpad_size : scratchpad_sizes)
+    {
+      if (scratchpad_size > 0)
+      {
+        AllocationNodeInformation temp_alloc;
 
+        temp_alloc.size = scratchpad_size;
         temp_alloc.first_node = i - 1;
         temp_alloc.last_node = i + 1;
         temp_alloc.node_num = i;
@@ -352,7 +376,7 @@ void ExecutionPlanner::dump_inform()
   {
     auto current_node_it = std::find_if(
       _alloc_node_inform_vector.begin(), _alloc_node_inform_vector.end(),
-      [this, i](const AllocationNodeInformation &x) { return x.node_num == i && !x.is_temp; });
+      [i](const AllocationNodeInformation &x) { return x.node_num == i && !x.is_temp; });
     for (uint32_t j = 0; j < _ordered_nodes.size(); j++)
     {
       auto first_node = _alloc_node[j];
@@ -360,7 +384,7 @@ void ExecutionPlanner::dump_inform()
 
       auto it = std::find_if(
         _alloc_node_inform_vector.begin(), _alloc_node_inform_vector.end(),
-        [this, j](const AllocationNodeInformation &x) { return x.node_num == j && !x.is_temp; });
+        [j](const AllocationNodeInformation &x) { return x.node_num == j && !x.is_temp; });
       if (i >= first_node && i <= last_node)
       {
         current_node_it->breadth += it->size;
@@ -386,4 +410,4 @@ void ExecutionPlanner::dump_inform()
             });
 }
 
-} // namespace luci
+} // namespace circle_planner
diff --git a/compiler/circle-execution-plan/src/ExecutionPlanner.h b/compiler/circle-execution-plan/src/ExecutionPlanner.h
index 8e3d9b46a..e0833c407 100644
--- a/compiler/circle-execution-plan/src/ExecutionPlanner.h
+++ b/compiler/circle-execution-plan/src/ExecutionPlanner.h
@@ -17,10 +17,15 @@
 #ifndef CIRCLE_EXECUTION_PLANNER_H
 #define CIRCLE_EXECUTION_PLANNER_H
 
+#include "TargetPlatform.h"
+#include "IScratchpadHelper.h"
+#include "ScratchpadHelperLinux.h"
+#include "ScratchpadHelperMCU.h"
+#include "ScratchpadHelperCMSISNN.h"
 #include <luci/IR/Module.h>
 #include <luci/Plan/CircleNodeExecutionPlan.h>
 
-namespace luci
+namespace circle_planner
 {
 // struct for additional information for the node. it helps build allocations plan for nodes.
 struct AllocationNodeInformation
@@ -50,7 +55,7 @@ struct AllocationNodeInformation
   uint32_t last_node;
   // is the current node temporary or not
   bool is_temp;
-  // operation breadth of current node
+  // Breadth is the sum of the sizes of all live tensors at the moment the given node is executed
   uint32_t breadth;
 
   bool operator<(const AllocationNodeInformation &other) const { return offset < other.offset; }
@@ -60,12 +65,44 @@ class ExecutionPlanner
 {
 public:
   ExecutionPlanner() = delete;
-  explicit ExecutionPlanner(loco::Graph *graph) { _graph = graph; };
+  explicit ExecutionPlanner(loco::Graph *graph) : _graph(graph)
+  {
+    _scratchpad_helper = std::make_unique<ScratchpadHelperLinux>();
+  }
+
+  explicit ExecutionPlanner(loco::Graph *graph, TargetPlatform target_platform) : _graph(graph)
+  {
+    switch (target_platform.platform_type)
+    {
+      case LINUX:
+        _scratchpad_helper = std::make_unique<ScratchpadHelperLinux>();
+        break;
+      case MCU:
+        _scratchpad_helper = std::make_unique<ScratchpadHelperMCU>();
+        break;
+      case CMSISNN:
+        _scratchpad_helper = std::make_unique<ScratchpadHelperCMSISNN>(target_platform.use_dsp);
+        break;
+      default:
+        assert(false && "Use unsupported platform");
+    }
+  };
 
   // Method provides execution plan, which contains execution order and
   // memory offsets for all nodes in _graph.
   // This plan writes in nodes annotation information with help of CircleNodeExecutionPlan class.
-  void get_execution_plan();
+  void make_execution_plan();
+
+  // Method changes the planning mode:
+  // is_null_consts = true - constants are no longer taken into account when planning
+  // is_null_inputs = true - inputs are no longer taken into account when planning
+  // is_null_scratchpads = true - scratchpads are no longer taken into account when planning
+  void change_planning_mode(bool is_null_consts, bool is_null_inputs, bool is_null_scratchpads)
+  {
+    _is_null_consts = is_null_consts;
+    _is_null_inputs = is_null_inputs;
+    _is_null_scratchpads = is_null_scratchpads;
+  };
 
 private:
   // Method gets default execution order plan and saves it in _ordered_nodes vector.
@@ -83,18 +120,19 @@ private:
   // Return: required size of buffer.
   uint32_t get_offsets_with_greedy_by_size();
 
-  // Realization of greedy by size approach to find offsets for nodes.
+  // Implementation of the greedy-by-size approach (the algorithm is described in the
+  // "EFFICIENT MEMORY MANAGEMENT FOR DEEP NEURAL NET INFERENCE" paper) to find offsets for nodes.
   uint32_t greedy_by_size_approach();
 
   // Method creates and fills _alloc_node_inform_vector with usage interval inform and node's sizes.
   // null_consts = true - size of const nodes will be equal 0;
   // null_inputs = true - size of input nodes will be equal 0;
-  // null_im2col = true - size of im2col nodes will be equal 0;
-  // It using if we don't want to take input(const or im2col) nodes into account
+  // null_scratchpad = true - size of scratchpad nodes will be equal 0;
+  // It is used if we don't want to take input (const or scratchpad) nodes into account
   // when determining offsets and calculating the required buffer size. This is uses for
   // experiments.
   void create_alloc_node_inform_vector(bool null_consts = false, bool null_inputs = false,
-                                       bool null_im2col = false);
+                                       bool null_scratchpad = false);
 
   // Stores allocation additional information for the all nodes from _graph.
   std::vector<AllocationNodeInformation> _alloc_node_inform_vector;
@@ -121,10 +159,21 @@ private:
 
   loco::Graph *_graph;
 
+  // Helper that calculates the sizes of scratchpad tensors for the current platform
+  std::unique_ptr<IScratchpadHelper> _scratchpad_helper;
+
   // Required memory size.
   uint32_t _required_size = 0;
+
+  // Flags for choosing different planning modes:
+  // _is_null_consts = true - constants are no longer taken into account when planning
+  // _is_null_inputs = true - inputs are no longer taken into account when planning
+  // _is_null_scratchpads = true - scratchpads are no longer taken into account when planning
+  bool _is_null_consts = false;
+  bool _is_null_inputs = false;
+  bool _is_null_scratchpads = false;
 };
 
-} // namespace luci
+} // namespace circle_planner
 
 #endif // CIRCLE_EXECUTION_PLANNER_H
author	Chunseok Lee <chunseok.lee@samsung.com>	2022-04-15 19:15:11 +0900
committer	Chunseok Lee <chunseok.lee@samsung.com>	2022-04-15 19:15:11 +0900
commit	3ad689f0803519e343c36d5700646e86059df961 (patch)
tree	862346c401a5577518fa7f042532aa931b53aa0e /compiler/circle-execution-plan
parent	ac6e4dd7b480e83b586ef533d7b29a8a97eb48fe (diff)
download	nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.gz nnfw-3ad689f0803519e343c36d5700646e86059df961.tar.bz2 nnfw-3ad689f0803519e343c36d5700646e86059df961.zip