Diffstat (limited to 'runtime/neurun/core/src/compiler/HEScheduler.cc')
-rw-r--r--  runtime/neurun/core/src/compiler/HEScheduler.cc  628
1 file changed, 628 insertions(+), 0 deletions(-)
diff --git a/runtime/neurun/core/src/compiler/HEScheduler.cc b/runtime/neurun/core/src/compiler/HEScheduler.cc
new file mode 100644
index 000000000..aec68d655
--- /dev/null
+++ b/runtime/neurun/core/src/compiler/HEScheduler.cc
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ir/Operand.h"
+#include "compiler/HEScheduler.h"
+#include "ir/Graph.h"
+#include "util/ConfigSource.h"
+#include "compiler/IExecutionBuilder.h"
+#include "compiler/BackendResolver.h"
+#include "backend/IShapeFixer.h"
+#include "util/logging.h"
+#include "util/Utils.h"
+#include "exec/FunctionSequence.h"
+#include <cassert>
+#include <cmath>
+#include <chrono>
+
+namespace neurun
+{
+
+namespace compiler
+{
+static uint32_t getOperationsFlattenedIOSize(const ir::Graph &graph, const ir::Operation &node)
+{
+ uint32_t size = 0;
+ for (const auto &input : node.getInputs())
+ {
+ size += graph.operands().at(input).info().total_size();
+ }
+ for (const auto &output : node.getOutputs())
+ {
+ size += graph.operands().at(output).info().total_size();
+ }
+ return size;
+}
+
+static bool isQuant(const ir::Graph &graph, const ir::Operation &node)
+{
+ for (const auto &input : node.getInputs())
+ {
+ const auto &obj = graph.operands().at(input);
+ if (obj.typeInfo().type() == ir::DataType::QUANT8_ASYMM)
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isWorkaroundSkip(const ir::Graph &graph, const backend::Backend *backend,
+ const ir::Operation &node, bool quant)
+{
+  /* TODO: This is a workaround; come up with a better solution if possible.
+     Adding an exception in the stage doesn't help, because if there is a record for Add
+     without broadcast, scheduling will select it: it doesn't distinguish broadcast from
+     non-broadcast the way it does quantized from non-quantized. */
+ if (backend->config()->id() == "cpu" &&
+ (node.opcode() == ir::OpCode::Add || node.opcode() == ir::OpCode::Sub ||
+ node.opcode() == ir::OpCode::Mul))
+ {
+ const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)};
+    /* Broadcasting isn't supported on CPU: there is no way to distinguish an existing
+     * exec_time record with broadcasting from one without it */
+ if (!(graph.operands().at(lhs_index).shape() == graph.operands().at(rhs_index).shape()))
+ {
+ return true;
+ }
+ }
+  /* TODO: This is a workaround; come up with a better solution if possible.
+     Adding an exception in the stage doesn't help, because if there is a record for Mul
+     without broadcast, scheduling will select it: it doesn't distinguish broadcast from
+     non-broadcast the way it does quantized from non-quantized. */
+ else if (backend->config()->id() == "acl_neon" && node.opcode() == ir::OpCode::Mul)
+ {
+ const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)};
+
+    // Quantized Mul and nontrivial broadcasting aren't supported yet
+ if (quant ||
+ !(graph.operands().at(lhs_index).shape() == graph.operands().at(rhs_index).shape()))
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if the node can be merged into an op_seq
+static bool isMergeable(const ir::Graph &graph, const ir::Operation &node)
+{
+ size_t prev_op_cnt = 0;
+ for (const auto &input : node.getInputs())
+ {
+    // Skip constants: only non-constant inputs matter here
+ const auto &operand = graph.operands().at(input);
+ if (operand.isConstant())
+ continue;
+
+    // This operand is the output of an operation, not a weight or bias
+ if (operand.getDef().list().size() > 0)
+ ++prev_op_cnt;
+
+    // The current node has multiple inputs (e.g. Concat) or is at the beginning of a separate branch
+ if (prev_op_cnt > 1 || operand.getUses().list().size() > 1)
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
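+// Assigns backends round-robin over the ops in descending rank order, so that later
+// profiling runs exercise as many backend-to-backend transfers as possible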
+void HEScheduler::scheduleShufflingBackends()
+{
+ VERBOSE(HEScheduler::schedule)
+ << "Started task scheduling: uses all backends to get more metrics for data transfer"
+ << std::endl;
+ size_t backend_ind = 0;
+ for (const auto &rank : _rank_to_op)
+ {
+ VERBOSE(HEScheduler::schedule) << "scheduling (" << rank.second.value() << ")" << std::endl;
+ const auto &node = _graph->operations().at(rank.second);
+ const bool quant = isQuant(*_graph, node);
+ const auto size = getOperationsFlattenedIOSize(*_graph, node);
+ for (size_t i = 0;; ++i)
+ {
+ if (i == _all_backends.size())
+ {
+ // wasn't able to find backend
+ assert(false);
+ break;
+ }
+ if (backend_ind == _all_backends.size())
+ {
+ backend_ind = 0;
+ }
+ if (isWorkaroundSkip(*_graph, _all_backends[backend_ind], node, quant))
+ {
+ ++backend_ind;
+ continue;
+ }
+ const auto exec_time =
+ _exec_time->getOperationExecTime(_all_backends[backend_ind], node.name(), quant, size);
+ // Scheduling to measure data transfer must be done after measuring all backends separately
+ assert(exec_time != _exec_time->NOT_FOUND);
+ if (exec_time == _exec_time->getMax())
+ {
+ ++backend_ind;
+ continue;
+ }
+ _backend_resolver->setBackend(rank.second, _all_backends[backend_ind]);
+ VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
+ << _all_backends[backend_ind]->config()->id() << std::endl;
+ ++backend_ind;
+ break;
+ }
+ }
+}
+
+bool HEScheduler::isNodeProfiled(const ir::Operation &node)
+{
+ const bool quant = isQuant(*_graph, node);
+ const auto size = getOperationsFlattenedIOSize(*_graph, node);
+ for (const auto *backend : _all_backends)
+ {
+ const auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
+ if (exec_time == _exec_time->NOT_FOUND)
+ return false;
+ }
+ return true;
+}
+
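+// Walks down a linear branch starting at `index`, scheduling each node in turn. The walk
+// stops when a node is assigned a backend different from its parent's, when a node has
+// more than one output, or when the next node can't be merged into the same op_seq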
+void HEScheduler::scheduleBranch(const ir::OperationIndex &index,
+ ir::OperationIndexMap<bool> &scheduled)
+{
+ auto loc_index = index;
+ const backend::Backend *parent_backend = nullptr;
+ while (true)
+ {
+ if (scheduled[loc_index])
+ {
+ return;
+ }
+ if (!schedule(loc_index, parent_backend))
+ {
+ return;
+ }
+ scheduled[loc_index] = true;
+ parent_backend = _backend_resolver->getBackend(loc_index);
+
+ const auto &node = _graph->operations().at(loc_index);
+    /* Get the single output operand, which is both this node's only output and the input
+     * of the next single operation */
+ if (node.getOutputs().size() != 1)
+ {
+ return;
+ }
+ const auto &only_out_operand = _graph->operands().at(*node.getOutputs().begin());
+ loc_index = only_out_operand.getUses().list().front();
+    /* Verify that the next node is neither the beginning nor the ending node of a branch */
+ const auto &next_node = _graph->operations().at(loc_index);
+ if (!isMergeable(*_graph, next_node))
+ {
+ return;
+ }
+ }
+}
+
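+// Entry point. Roughly HEFT-style list scheduling:
+//  1. makeRank() assigns a priority (rank) to every operation;
+//  2. in profiling mode, once every backend/node pair has been profiled, backends are
+//     shuffled round-robin to collect inter-backend transfer timings;
+//  3. otherwise operations are scheduled in descending rank order, branch by branch,
+//     each one on the backend with the smallest earliest finish time (EFT)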
+std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const ir::Graph &graph)
+{
+ _graph = &graph;
+ VERBOSE(HEScheduler::schedule) << "task scheduling started" << std::endl;
+ // Make ranks and save in descending order
+ makeRank();
+
+ for (const auto *backend : _all_backends)
+ {
+ _backends_avail_time.emplace(backend, std::map<int64_t, int64_t>{{0, 0}});
+ }
+
+ const bool is_profiling = util::getConfigBool(util::config::PROFILING_MODE);
+ if (is_profiling)
+ {
+ // Check if profiling info about all backend/node pairs already exists
+ bool all_nodes_are_profiled = true;
+ _graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) {
+ if (all_nodes_are_profiled)
+ all_nodes_are_profiled = isNodeProfiled(op);
+ });
+
+    // If all nodes are already profiled, schedule backends in an order that lets more
+    // profiling information about inter-backend data transfer be collected
+ if (all_nodes_are_profiled)
+ {
+ scheduleShufflingBackends();
+ VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
+ return std::move(_backend_resolver);
+ }
+ }
+
+ ir::OperationIndexMap<bool> visited;
+ graph.operations().iterate(
+ [&](const ir::OperationIndex &index, const ir::Operation &) { visited[index] = false; });
+  // For each task, select the backend with the smallest earliest finish time (EFT)
+ for (const auto &rank : _rank_to_op)
+ {
+ scheduleBranch(rank.second, visited);
+ }
+ VERBOSE(HEScheduler::schedule) << "task scheduling finished" << std::endl;
+ return std::move(_backend_resolver);
+}
+
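+// Returns the profiled exec time if a record exists; otherwise 1 (negligible) for a
+// supported op, or getMax() (prohibitive) for an unsupported one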
+int64_t HEScheduler::getOpTime(const backend::Backend *backend, const std::string &operation,
+ bool quant, uint32_t size)
+{
+ const auto time = _exec_time->getOperationExecTime(backend, operation, quant, size);
+ if (time != _exec_time->NOT_FOUND)
+ return time;
+
+ return _is_supported.at(backend).at(operation) ? 1 : _exec_time->getMax();
+}
+
+int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend,
+ const backend::Backend *dst_backend, bool quant, uint32_t size)
+{
+ const auto time = _exec_time->getPermuteTime(src_backend, dst_backend, quant, size);
+ if (time != _exec_time->NOT_FOUND)
+ return time;
+
+  // No profiled record: charge a heuristic cost of roughly 1 us per 200 bytes. Keeping
+  // it nonzero makes the scheduler prefer keeping computations on one backend
+  return size / 200;
+}
+
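+// Probes whether `backend` supports `node` by running the backend's shape fixer, caching
+// the result in _is_supported. Returns 1 if supported and getMax() otherwise. Outside
+// profiling mode a backend without profiling info is marked unsupported, so it won't be
+// picked during scheduling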
+int64_t HEScheduler::tryBackend(const ir::Operation &node, const backend::Backend *backend)
+{
+  // If there is no profiling info, don't use this backend during scheduling
+ if (!util::getConfigBool(util::config::PROFILING_MODE))
+ {
+    VERBOSE(HEScheduler::tryBackend)
+        << "Trying to HE schedule, but there is no profiling info for " << node.name()
+        << " on backend " << backend->config()->id() << ", so this backend won't be used."
+        << std::endl;
+ _is_supported[backend][node.name()] = false;
+ return _exec_time->getMax();
+ }
+ auto iter = _is_supported.find(backend);
+ if (iter != _is_supported.end())
+ {
+ auto it2 = iter->second.find(node.name());
+ if (it2 != iter->second.end())
+ {
+ return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
+ }
+ }
+ try
+ {
+ node.accept(*_backend_resolver->getBackendContext(backend)->shape_fixer);
+
+ _is_supported[backend][node.name()] = true;
+ }
+ catch (std::runtime_error &e)
+ {
+ _is_supported[backend][node.name()] = false;
+ }
+ return _is_supported[backend][node.name()] ? 1 : _exec_time->getMax();
+}
+
+void HEScheduler::makeRank()
+{
+ VERBOSE(HEScheduler::makeRank) << "task prioritizing" << std::endl;
+
+ _graph->operations().iterate(
+ [&](const ir::OperationIndex &index, const ir::Operation &) { DFSMaxRank(index); });
+
+  // Check that ranks are calculated for all operations (nodes)
+ _graph->operations().iterate([&](const ir::OperationIndex &index, const ir::Operation &) {
+ UNUSED_RELEASE(index);
+ assert(_op_to_rank->find(index) != _op_to_rank->end());
+ });
+ VERBOSE(HEScheduler::makeRank) << "task prioritizing finished" << std::endl;
+}
+
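+// Computes the rank of one operation, recursing into its children first. Roughly:
+//   rank(n) = mean_exec_time(n) * stddev_exec_time(n) + max_child_rank(n)
+// with the mean and standard deviation taken over the backends that support the op.
+// Unlike the classic HEFT upward rank (mean time + max over children of transfer cost +
+// child rank), this additionally weights ops whose cost varies strongly between
+// backends, since those presumably gain the most from a good placement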
+int64_t HEScheduler::DFSMaxRank(const ir::OperationIndex &index)
+{
+ auto op_to_rank_it = _op_to_rank->find(index);
+ if (op_to_rank_it != _op_to_rank->end())
+ return op_to_rank_it->second;
+
+ const auto &node = _graph->operations().at(index);
+ int64_t rank = 0;
+ const bool quant = isQuant(*_graph, node);
+ const auto size = getOperationsFlattenedIOSize(*_graph, node);
+ auto supported_backends_quantity = static_cast<int64_t>(_all_backends.size());
+
+ const auto max_child_rank = DFSChildrenMaxRank(index);
+
+ // get average exec time of this op
+ for (const auto &backend : _all_backends)
+ {
+ auto exec_time = _exec_time->getOperationExecTime(backend, node.name(), quant, size);
+ if (exec_time == _exec_time->NOT_FOUND)
+ {
+ exec_time = tryBackend(node, backend);
+ }
+ if (exec_time < _exec_time->getMax())
+ {
+ rank += exec_time;
+ }
+ else
+ {
+      // This operation isn't supported on this backend
+ --supported_backends_quantity;
+ }
+ }
+ if (supported_backends_quantity == 0)
+ {
+ throw std::runtime_error{"Encountered unsupported op: " + node.name()};
+ }
+ rank /= supported_backends_quantity;
+
+ // get standard deviation
+ int64_t std = 0;
+ for (const auto backend : _all_backends)
+ {
+ const auto exec_time = getOpTime(backend, node.name(), quant, size);
+ if (exec_time < _exec_time->getMax())
+ {
+ std += (exec_time - rank) * (exec_time - rank);
+ }
+ }
+ std /= supported_backends_quantity;
+ if (std > 0)
+ {
+    std = static_cast<int64_t>(std::sqrt(std));
+ rank *= std;
+ }
+ rank += max_child_rank;
+
+ assert(rank >= 0);
+ _rank_to_op.emplace(rank, index);
+ _op_to_rank->emplace(index, rank);
+ VERBOSE(HEScheduler::DFSMaxRank) << "rank of operation (" << index.value() << ")" << node.name()
+ << " is " << rank << std::endl;
+
+ return rank;
+}
+
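+// Returns the maximum over all children of (child rank + transfer cost of the connecting
+// operand), where the transfer cost is summed over all ordered backend pairs and divided
+// by the number of backends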
+int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index)
+{
+ const auto &node = _graph->operations().at(index);
+ int64_t max_child_rank = 0;
+ for (const auto &output : node.getOutputs())
+ {
+ const auto &operand = _graph->operands().at(output);
+ const bool quant = operand.typeInfo().type() == ir::DataType::QUANT8_ASYMM;
+ // average data transfer cost of this operand's data
+ int64_t avg_transfer_cost = 1;
+ for (const auto *backend : _all_backends)
+ {
+ for (const auto *other_backend : _all_backends)
+ {
+ if (backend == other_backend)
+ {
+ continue;
+ }
+ auto transfer_cost =
+ _exec_time->getPermuteTime(backend, other_backend, quant, operand.info().total_size());
+ if (transfer_cost == _exec_time->NOT_FOUND)
+ {
+ // Makes the scheduler prefer keeping computations on one backend
+ transfer_cost = operand.info().total_size() / 100;
+ }
+ avg_transfer_cost += transfer_cost;
+ }
+ }
+ avg_transfer_cost /= _all_backends.size();
+ for (const auto &use : operand.getUses().list())
+ {
+ const auto cur_child_rank = DFSMaxRank(use);
+ max_child_rank = std::max(max_child_rank, cur_child_rank + avg_transfer_cost);
+ }
+ }
+ return max_child_rank;
+}
+
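+// The taskset _backends_avail_time[backend] maps the finish time of every op already
+// placed on that backend to its start time. Returns the earliest moment at or after
+// starting_time from which a gap of more than time_amount is available.
+// Worked example: with a taskset {{0, 0}, {10, 5}, {25, 20}}, calling this with
+// starting_time = 3 and time_amount = 4 first tries the gap [3, 5), which is too small,
+// skips past the op finishing at 10, and returns 11, since the gap [11, 20) is wide
+// enough. Note the <= below: a gap of exactly time_amount is treated as insufficient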
+int64_t HEScheduler::backendAvailableTime(const backend::Backend *backend,
+ const int64_t &starting_time, const int64_t &time_amount)
+{
+  const auto &backend_times = _backends_avail_time.at(backend);
+  // finishing and starting times of the op that will come after the current op
+  auto next_op_fst = backend_times.upper_bound(starting_time);
+  // finishing time of the op that will come before the current op
+  auto prev_op_ft = starting_time;
+  // Advance until we reach a hole/gap big enough to run this op
+ while (next_op_fst != backend_times.end() && next_op_fst->second - prev_op_ft <= time_amount)
+ {
+ prev_op_ft = next_op_fst->first + 1;
+ ++next_op_fst;
+ }
+ return prev_op_ft;
+}
+
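+// Schedules a single operation: picks the backend minimizing EFT = EST + exec time,
+// books the chosen slot in that backend's taskset, and books the required permutations
+// on the cpu backend's taskset. Returns false if the node is part of a branch whose
+// parent was assigned a different backend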
+bool HEScheduler::schedule(const ir::OperationIndex &index, const backend::Backend *parent_backend)
+{
+ VERBOSE(HEScheduler::schedule) << "scheduling (" << index.value() << ")" << std::endl;
+ int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
+ const auto &node = _graph->operations().at(index);
+
+ std::multimap<int64_t, int64_t> selected_transfer_st_exec_time;
+  // Select the backend with the smallest EFT for this task
+ const backend::Backend *chosen_backend = nullptr;
+ for (const auto *backend : _all_backends)
+ {
+ std::multimap<int64_t, int64_t> transfer_st_exec_time;
+ const auto est_and_et = ESTAndExecTime(backend, index, transfer_st_exec_time);
+
+ if (eft > est_and_et.first + est_and_et.second)
+ {
+ eft = est_and_et.first + est_and_et.second;
+ selected_exec_time = est_and_et.second;
+ chosen_backend = backend;
+ selected_transfer_st_exec_time = transfer_st_exec_time;
+ }
+ }
+
+ if (chosen_backend == nullptr)
+ {
+    throw std::runtime_error{"Failed to choose a backend in scheduler"};
+ }
+
+  // This node is part of a branch, and a backend other than its parent's was chosen for it
+ if (parent_backend && parent_backend != chosen_backend)
+ {
+ return false;
+ }
+ for (const auto &it : selected_transfer_st_exec_time)
+ {
+ auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
+ _backends_avail_time[_cpu_backend].insert({prev_op_ft + it.second, prev_op_ft});
+ }
+
+ _ops_eft[index] = eft;
+ _backends_avail_time[chosen_backend].emplace(eft, eft - selected_exec_time);
+ _backend_resolver->setBackend(index, chosen_backend);
+
+ VERBOSE(HEScheduler::schedule) << "backend for " << node.name() << " is "
+ << chosen_backend->config()->id() << ". Its eft: " << eft
+ << std::endl;
+ return true;
+}
+
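+// Returns {EST, exec time} of `index` on `backend`, so that the caller's EFT is their
+// sum. The EST (earliest start time) accounts for the data transfers from predecessors
+// placed on other backends. For a non-parallel executor the first element is just the
+// total transfer cost, since all backends are otherwise free at that point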
+std::pair<int64_t, int64_t>
+HEScheduler::ESTAndExecTime(const backend::Backend *backend, const ir::OperationIndex &index,
+ std::multimap<int64_t, int64_t> &transfer_st_exec_time)
+{
+ const bool is_linear_exec = "Linear" == util::getConfigString(util::config::EXECUTOR);
+ const bool is_parallel_exec = "Parallel" == util::getConfigString(util::config::EXECUTOR);
+  // A permutation causes the creation of a separate op_seq that contains just the
+  // permutation node. This isn't needed for the Linear executor, since it doesn't use
+  // subgraphs. The value of 1 ms (1000 us) was picked experimentally
+  int64_t permute_fine = 1000;
+  // Multiply CPU operations' exec time by 2, because in the parallel executor the CPU
+  // might be busy with permutations on other branches or with non-nnfw-specific tasks
+  // that have to be waited for. The factor 2 was picked experimentally
+  const int64_t CPU_DELAY = 2;
+ const auto &node = _graph->operations().at(index);
+ const bool quant = isQuant(*_graph, node);
+ const auto size = getOperationsFlattenedIOSize(*_graph, node);
+  // If this node can be part of an op_seq, then assigning it a different backend will
+  // cause the creation of another op_seq
+ if (isMergeable(*_graph, node))
+ {
+ permute_fine *= 2;
+ }
+ if (isWorkaroundSkip(*_graph, backend, node, quant))
+ {
+ return {_exec_time->getMax(), _exec_time->getMax()};
+ }
+ // get average exec time of the op on this backend
+ auto exec_time = getOpTime(backend, node.name(), quant, size);
+ if (backend->config()->id() == "cpu" && is_parallel_exec)
+ {
+ exec_time *= CPU_DELAY;
+ }
+
+ // get max eft of direct (one level above) predecessors
+ auto max_pred_eft = predMaxEFT(backend, node, transfer_st_exec_time);
+
+ int64_t total_transfer_cost = 0;
+ std::vector<std::multimap<int64_t, int64_t>::iterator> inserted_permutations;
+  // Find free time for the data transfers and insert them into the backend's taskset.
+  // This is needed so that:
+  // 1. the time for multiple permutations of this node's inputs is found correctly;
+  // 2. if backend == cpu, the free time for this node comes after the permutations
+ for (auto &it : transfer_st_exec_time)
+ {
+ if (is_parallel_exec)
+ {
+ it.second *= CPU_DELAY;
+ }
+ if (!is_linear_exec)
+ {
+ it.second += permute_fine;
+ }
+ total_transfer_cost += it.second;
+
+ const auto prev_op_ft = backendAvailableTime(_cpu_backend, it.first, it.second);
+
+ max_pred_eft = std::max(max_pred_eft, prev_op_ft + it.second);
+
+ const auto tmp = _backends_avail_time[_cpu_backend].emplace(prev_op_ft + it.second, prev_op_ft);
+ inserted_permutations.push_back(tmp.first);
+ }
+  // Find the hole/gap where this op can be put, or the finishing time of the last assigned op
+ auto prev_op_ft = backendAvailableTime(backend, max_pred_eft, exec_time);
+
+  // Remove the inserted permutations from the cpu's taskset
+ for (const auto &it : inserted_permutations)
+ {
+ _backends_avail_time[_cpu_backend].erase(it);
+ }
+
+  /* With a non-parallel executor, measure just the exec time and the data transfer time,
+   * because the EFT (prev_op_ft) is the same for all backends: two operations can't run
+   * simultaneously, so the running operation must be waited for. When an operation
+   * starts, all backends are free, so they need time only for the data transfer. */
+ if (!is_parallel_exec)
+ {
+ VERBOSE(HEScheduler::ESTAndExecTime)
+ << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
+ << backend->config()->id() << " is " << exec_time
+ << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl;
+
+ return {total_transfer_cost, exec_time};
+ }
+ VERBOSE(HEScheduler::ESTAndExecTime)
+ << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
+ << backend->config()->id() << ": " << exec_time
+ << " microseconds. Backend available time: " << prev_op_ft
+ << " Parent's max eft: " << max_pred_eft - total_transfer_cost
+ << " data transfer cost: " << total_transfer_cost << std::endl;
+
+ return {prev_op_ft, exec_time};
+}
+
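+// Returns the maximum EFT among the node's direct predecessors. For every predecessor
+// placed on a backend other than `backend`, a pending data transfer is recorded in
+// transfer_st_exec_time as a {predecessor EFT, transfer time} pair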
+int64_t HEScheduler::predMaxEFT(const backend::Backend *backend, const ir::Operation &node,
+ std::multimap<int64_t, int64_t> &transfer_st_exec_time)
+{
+ int64_t max_pred_eft = 0;
+ for (const auto &input_operand_idx : node.getInputs())
+ {
+ const auto &input_operand = _graph->operands().at(input_operand_idx);
+ const bool quant = input_operand.typeInfo().type() == ir::DataType::QUANT8_ASYMM;
+
+ for (const auto &input_node_idx : input_operand.getDef().list())
+ {
+      // Data transfer cost from the parent node's backend to the current node's backend:
+ auto parent_backend = _backend_resolver->getBackend(input_node_idx);
+
+ max_pred_eft = std::max(max_pred_eft, _ops_eft.at(input_node_idx));
+ if (parent_backend != backend)
+ {
+        // Multiply the operand size by 2, because the size must cover both input and output
+ int64_t transfer_cost =
+ getPermuteTime(parent_backend, backend, quant, input_operand.info().total_size() * 2);
+ transfer_st_exec_time.emplace(_ops_eft.at(input_node_idx), transfer_cost);
+ }
+ }
+ }
+ return max_pred_eft;
+}
+
+} // namespace compiler
+
+} // namespace neurun