author     Haixin Liu <haixin@fb.com>  2018-11-16 18:30:49 -0800
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2018-11-16 18:53:28 -0800
commit     86dc3ab252c8b7e14a5c4229bcd41b790282cd94
tree       4619714e24154b32868a8394ef184b68f66aa979 /caffe2/observers
parent     a1fa9d8cf9b2b0e7373ec420c2487d4dfd0e587c
Add cost into profile observer (#12793)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12793
Add analytical cost to the profile observer. It reports op-level cost information for each op run, plus net-level cost information aggregated per op type.
It outputs the following information:
1. analytical flops
2. analytical bytes_read
3. analytical bytes_written
Example output at op level:
```
I1017 14:58:14.245978 3686541 profile_observer_gpu.cc:26] --------- Starting operator FC op#24 ---------
I1017 14:58:14.246049 3686541 profile_observer_gpu.cc:33] Input 0: Tensor model1/embedded_encoder_inputs of type float. Dims: (17,1,256,):
I1017 14:58:14.246109 3686541 profile_observer_gpu.cc:33] Input 1: Tensor model1/encoder/layer0/fw/milstm/i2h_w of type float. Dims: (2048,256,):
I1017 14:58:14.246176 3686541 profile_observer_gpu.cc:33] Input 2: Tensor model1/encoder/layer0/fw/milstm/i2h_b of type float. Dims: (2048,):
I1017 14:58:14.246217 3686541 profile_observer_gpu.cc:44] Argument 0: name: "use_cudnn" i: 1
I1017 14:58:14.246271 3686541 profile_observer_gpu.cc:44] Argument 1: name: "cudnn_exhaustive_search" i: 0
I1017 14:58:14.246338 3686541 profile_observer_gpu.cc:44] Argument 2: name: "order" s: "NHWC"
I1017 14:58:14.246372 3686541 profile_observer_gpu.cc:44] Argument 3: name: "axis" i: 2
I1017 14:58:14.246418 3686541 profile_observer_gpu.cc:44] Argument 4: name: "quantization_scheme" i: 1
I1017 14:58:14.246470 3686541 profile_observer_gpu.cc:53] Output 0: Tensor model1/encoder/layer0/fw/milstm/i2h of type float. Dims: (17,1,2048,):
I1017 14:58:14.246596 3686541 profile_observer_gpu.cc:61] Cost (flops, bytes_read, bytes_written):
I1017 14:58:14.246649 3686541 profile_observer_gpu.cc:62] 17860608 2122752 139264
I1017 14:58:14.246677 3686541 profile_observer_gpu.cc:64] --------- Finished operator FC in 0.764221 ms ---------
```
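The FC numbers above follow from the cost-inference formulas that the new profile_observer_test.cc asserts: flops = M * N * (2K + 1), bytes_read = (K * (M + N) + N) * sizeof(float), bytes_written = M * N * sizeof(float). A standalone sketch (not part of this diff), plugging in the dimensions read off the log (input (17, 1, 256) with axis = 2 gives M = 17 * 1 and K = 256; weight (2048, 256) gives N = 2048), reproduces them:
```
// Standalone sketch: reproduces the FC cost numbers printed above using the
// same formulas the new profile_observer_test.cc asserts for the FC op.
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t M = 17, N = 2048, K = 256;

  const uint64_t flops = M * N * (2 * K + 1);                    // 17860608
  const uint64_t bytes_read = (K * (M + N) + N) * sizeof(float); // 2122752
  const uint64_t bytes_written = M * N * sizeof(float);          // 139264

  std::cout << flops << " " << bytes_read << " " << bytes_written << "\n";
  return 0;
}
```
Compiled with any C++11 compiler, it prints `17860608 2122752 139264`, matching the cost line in the log above.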
Example output at net level:
```
I1017 11:13:44.675585 3146691 profile_observer_gpu.cc:165] ================ Detailed stats for net model0/encoder/layer0/bw/milstm ================
I1017 11:13:44.675662 3146691 profile_observer_gpu.cc:167] Cost (flops, bytes_read, bytes_written) per operator type:
I1017 11:13:44.675706 3146691 profile_observer_gpu.cc:169] 20992000 42045440 81920 FC
I1017 11:13:44.675745 3146691 profile_observer_gpu.cc:169] 20480 163840 81920 Mul
I1017 11:13:44.675824 3146691 profile_observer_gpu.cc:169] 20480 163840 81920 Sum
I1017 11:13:44.675878 3146691 profile_observer_gpu.cc:169] 0 0 0 ElementwiseLinear
I1017 11:13:44.675909 3146691 profile_observer_gpu.cc:169] 0 0 0 LSTMUnit
I1017 11:13:44.675958 3146691 profile_observer_gpu.cc:169] 0 0 0 rnn_internal_apply_link
```
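The new profile_observer_test.cc (included in this diff) doubles as a usage example: attach a ProfileObserver to a net, run it, then read the aggregated per-op-type cost. A trimmed sketch of that pattern, assuming a Workspace `ws` whose input blobs already exist and a NetDef `net_def`; the helper name `profile_net` is illustrative only:
```
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/observers/profile_observer.h"

// Illustrative helper (not part of this diff): profiles one run of `net_def`
// inside `ws` and logs the aggregated analytical FC cost.
void profile_net(caffe2::Workspace& ws,
                 const std::shared_ptr<caffe2::NetDef>& net_def) {
  caffe2::NetBase* net = ws.CreateNet(net_def, true /*overwrite*/);

  // Keep a raw pointer so the observer can be queried after Run().
  auto net_ob = caffe2::make_unique<caffe2::ProfileObserver>(net);
  const auto* ob = net_ob.get();
  auto* ref = net->AttachObserver(std::move(net_ob));

  net->Run();

  // Aggregated analytical cost per op type (flops, bytes_read, bytes_written).
  auto cost_per_op_type = ob->getAggregatedOpTypeCost();
  LOG(INFO) << "FC flops: " << cost_per_op_type["FC"].flops;

  net->DetachObserver(ref);
}
```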
Reviewed By: mdschatz
Differential Revision: D10428917
fbshipit-source-id: 7c100e551bdd3ac8d7c09be12c72d70a2d67cae1
Diffstat (limited to 'caffe2/observers')
-rw-r--r--  caffe2/observers/profile_observer.h       | 84
-rw-r--r--  caffe2/observers/profile_observer_gpu.cc  | 36
-rw-r--r--  caffe2/observers/profile_observer_test.cc | 81
3 files changed, 188 insertions(+), 13 deletions(-)
```
diff --git a/caffe2/observers/profile_observer.h b/caffe2/observers/profile_observer.h
index ab110da088..b0cd04df77 100644
--- a/caffe2/observers/profile_observer.h
+++ b/caffe2/observers/profile_observer.h
@@ -31,9 +31,11 @@ namespace caffe2 {
 
 /**
  * This observer displays a description of each operator executed in a network.
- * This includes input and tensors (name, size, type), arguments, and execution
- * time. This can be used to analyze different performance characteristics.
- * NOTE: Currently this observer only supports synchronized computation
+ * This includes input and tensors (name, size, type), arguments, analytical
+ *cost and execution time. This can be used to analyze different performance
+ *characteristics. NOTE: Currently this observer only supports synchronized
+ *computation. And for RNN, --caffe2_rnn_executor=False need to be set if want
+ *to get the cost summary at the net level.
  **/
 class ProfileObserver;
 
@@ -52,21 +54,30 @@ class ProfileCounter {
 class ProfileOperatorObserver : public ProfileCounter,
                                 public ObserverBase<OperatorBase> {
  public:
+  struct DetailedStat {
+    string opType;
+    struct OpSchema::Cost c;
+  };
   explicit ProfileOperatorObserver(OperatorBase* subject) = delete;
   explicit ProfileOperatorObserver(
       OperatorBase* subject,
+      DetailedStat* stat,
       ProfileObserver* netObserver)
-      : ObserverBase<OperatorBase>(subject), netObserver_(netObserver) {
+      : ObserverBase<OperatorBase>(subject),
+        stat_(stat),
+        netObserver_(netObserver) {
+    stat->opType = subject->debug_def().type();
     if (subject) {
       net_position_ = subject->net_position();
     }
   }
   explicit ProfileOperatorObserver(
       OperatorBase* subject,
+      DetailedStat* stat,
       ProfileObserver* netObserver,
       int net_position,
       int rnn_order)
-      : ProfileOperatorObserver(subject, netObserver) {
+      : ProfileOperatorObserver(subject, stat, netObserver) {
     net_position_ = net_position;
     rnn_order_ = rnn_order;
   }
@@ -86,7 +97,37 @@ class ProfileOperatorObserver : public ProfileCounter,
     return ss.str();
   }
 
+  OpSchema::Cost getOpCost() {
+    const string& op_type = subject_->debug_def().type();
+    auto* schema = OpSchemaRegistry::Schema(op_type);
+    OpSchema::Cost cost;
+    if (schema && schema->HasCostInferenceFunction()) {
+      vector<TensorShape> shapes = subject_->InputTensorShapes();
+
+      auto known_shapes = std::accumulate(
+          shapes.begin(),
+          shapes.end(),
+          true,
+          [](bool acc, const TensorShape& shape) {
+            return acc && !shape.unknown_shape();
+          });
+      if (known_shapes) {
+        cost = schema->InferCost(subject_->debug_def(), shapes);
+      }
+    }
+    return cost;
+  }
+
+  void updateDetailedStat(const OpSchema::Cost cost) {
+    stat_->c.flops += cost.flops;
+    stat_->c.bytes_read += cost.bytes_read;
+    stat_->c.bytes_written += cost.bytes_written;
+    stat_->c.params_bytes += cost.params_bytes;
+  }
+
  protected:
+  DetailedStat* stat_;
+  OpSchema::Cost cost_;
   ProfileObserver* netObserver_;
   int net_position_; // Needed because this is not visible in RNN Executor
   int rnn_order_ = OperatorBase::kNoNetPositionSet;
@@ -96,20 +137,41 @@ class ProfileOperatorObserver : public ProfileCounter,
   void Stop() override;
 };
 
-class ProfileObserver final : public OperatorAttachingNetObserver<
-                                  ProfileOperatorObserver,
-                                  ProfileObserver> {
+class ProfileObserver final : public ObserverBase<NetBase> {
  public:
   explicit ProfileObserver(NetBase* subject)
-      : OperatorAttachingNetObserver<ProfileOperatorObserver, ProfileObserver>(
-            subject,
-            this) {}
+      : ObserverBase<NetBase>(subject),
+        detailedOpStats_(subject->GetOperators().size()),
+        net_name_(subject->Name()) {
+    const auto& ops = subject->GetOperators();
+    for (int i = 0; i < ops.size(); i++) {
+      ops[i]->AttachObserver(caffe2::make_unique<ProfileOperatorObserver>(
+          ops[i], &detailedOpStats_[i], this));
+    }
+  }
+  ~ProfileObserver();
+  CaffeMap<string, OpSchema::Cost> getAggregatedOpTypeCost() const {
+    CaffeMap<string, OpSchema::Cost> cost_per_op_type;
+    for (int idx = 0; idx < detailedOpStats_.size(); ++idx) {
+      const auto& stat = detailedOpStats_[idx];
+      uint64_t flops = stat.c.flops;
+      uint64_t bytes_read = stat.c.bytes_read;
+      uint64_t bytes_written = stat.c.bytes_written;
+
+      cost_per_op_type[stat.opType].flops += flops;
+      cost_per_op_type[stat.opType].bytes_read += bytes_read;
+      cost_per_op_type[stat.opType].bytes_written += bytes_written;
+    }
+    return cost_per_op_type;
+  }
 
   void Start() override{};
   void Stop() override{};
 
  private:
   vector<const ProfileOperatorObserver*> operator_observers_;
+  std::vector<ProfileOperatorObserver::DetailedStat> detailedOpStats_;
+  std::string net_name_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc
index 5bd9b0a11b..518801a895 100644
--- a/caffe2/observers/profile_observer_gpu.cc
+++ b/caffe2/observers/profile_observer_gpu.cc
@@ -58,7 +58,10 @@ void ProfileOperatorObserver::Dump() const {
       LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
     }
   }
-
+  LOG(INFO) << "Cost (flops, bytes_read, bytes_written, op_type):";
+  LOG(INFO) << std::setw(15) << std::setfill(' ') << cost_.flops << " "
+            << cost_.bytes_read << " " << cost_.bytes_written << " "
+            << subject_->debug_def().type();
   LOG(INFO) << "--------- Finished operator " << subject_->debug_def().type()
             << " in " << run_time_ << " ms ---------";
 }
@@ -82,6 +85,9 @@ void ProfileOperatorObserver::Start() {
     }
   } else {
     start_time_ = timer_.MilliSeconds();
+
+    cost_ = getOpCost();
+    updateDetailedStat(cost_);
   }
 }
 
@@ -118,6 +124,32 @@ std::unique_ptr<ObserverBase<OperatorBase>> ProfileOperatorObserver::rnnCopy(
     int rnn_order) const {
   return std::unique_ptr<ObserverBase<OperatorBase>>(
       new ProfileOperatorObserver(
-          subject, netObserver_, net_position_, rnn_order));
+          subject, stat_, netObserver_, net_position_, rnn_order));
 }
+
+ProfileObserver::~ProfileObserver() {
+  static std::mutex loggingMutex;
+  std::lock_guard<std::mutex> lock(loggingMutex);
+
+  CaffeMap<string, OpSchema::Cost> cost_per_op_type = getAggregatedOpTypeCost();
+  // sort by decreasing flops.
+  std::vector<std::pair<std::string, OpSchema::Cost>> cost_per_op_type_vec(
+      cost_per_op_type.begin(), cost_per_op_type.end());
+  std::sort(
+      cost_per_op_type_vec.begin(),
+      cost_per_op_type_vec.end(),
+      [](const std::pair<std::string, OpSchema::Cost>& left,
+         const std::pair<std::string, OpSchema::Cost>& right) {
+        return left.second.flops > right.second.flops;
+      });
+  LOG(INFO) << "================ Detailed stats for net " << net_name_
+            << " ================";
+  LOG(INFO) << "Aggregated Cost (flops, bytes_read, bytes_written, op_type):";
+  for (const auto& item : cost_per_op_type_vec) {
+    LOG(INFO) << std::setw(15) << std::setfill(' ') << item.second.flops << " "
+              << item.second.bytes_read << " " << item.second.bytes_written
+              << " " << item.first;
+  }
+}
+
 } // namespace caffe2
diff --git a/caffe2/observers/profile_observer_test.cc b/caffe2/observers/profile_observer_test.cc
new file mode 100644
index 0000000000..6a98695cec
--- /dev/null
+++ b/caffe2/observers/profile_observer_test.cc
@@ -0,0 +1,81 @@
+#include "caffe2/core/common.h"
+#include "caffe2/core/net.h"
+#include "caffe2/core/observer.h"
+#include "caffe2/core/operator.h"
+#include "profile_observer.h"
+
+#include <gtest/gtest.h>
+#include "caffe2/utils/proto_utils.h"
+
+namespace caffe2 {
+
+namespace {
+
+OperatorDef* add_op(
+    const vector<string>& input,
+    const vector<string>& output,
+    const string& type,
+    NetDef* net) {
+  CHECK(net);
+  auto& op = *net->add_op();
+  op.set_type(type);
+  for (const auto& in : input) {
+    op.add_input(in);
+  }
+  for (const auto& out : output) {
+    op.add_output(out);
+  }
+
+  return net->mutable_op(net->op_size() - 1);
+}
+
+void fill_tensor(
+    const vector<int64_t>& shape,
+    const vector<float>& data,
+    TensorCPU* tensor) {
+  tensor->Resize(shape);
+  CAFFE_ENFORCE_EQ(data.size(), tensor->size());
+  auto ptr = tensor->mutable_data<float>();
+  for (int i = 0; i < tensor->size(); ++i) {
+    ptr[i] = data[i];
+  }
+}
+
+void add_blob(
+    const string& name,
+    const vector<int64_t>& shape,
+    const vector<float>& data,
+    Workspace* ws) {
+  auto* blob = ws->CreateBlob(name);
+  fill_tensor(shape, data, BlobGetMutableTensor(blob, CPU));
+}
+
+} // namespace
+
+TEST(ProfileObserverTest, TestFC) {
+  Workspace ws;
+  auto create_net_def = [&ws](int M, int N, int K) {
+    auto net_def = std::make_shared<NetDef>();
+    net_def->set_name("test");
+    add_op({"X", "W", "b"}, {"Y"}, "FC", net_def.get());
+    add_blob("W", {N, K}, vector<float>(N * K), &ws);
+    add_blob("b", {N}, vector<float>(N), &ws);
+    add_blob("X", {M, K}, vector<float>(M * K), &ws);
+    return net_def;
+  };
+
+  int M = 2, N = 3, K = 4;
+  NetBase* net = ws.CreateNet(create_net_def(M, N, K), true /*overwrite*/);
+  auto net_ob = caffe2::make_unique<ProfileObserver>(net);
+  const auto* ob = net_ob.get();
+  auto* ref = net->AttachObserver(std::move(net_ob));
+  net->Run();
+  CAFFE_ENFORCE(ob);
+  auto cost_per_op_type = ob->getAggregatedOpTypeCost();
+  CAFFE_ENFORCE(cost_per_op_type["FC"].flops == M * N * (2 * K + 1));
+  CAFFE_ENFORCE(
+      cost_per_op_type["FC"].bytes_read == (K * (M + N) + N) * sizeof(float));
+  CAFFE_ENFORCE(cost_per_op_type["FC"].bytes_written == M * N * sizeof(float));
+  net->DetachObserver(ref);
+}
+} // namespace caffe2
```