author     Haixin Liu <haixin@fb.com>  2018-11-16 18:30:49 -0800
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2018-11-16 18:53:28 -0800
commit     86dc3ab252c8b7e14a5c4229bcd41b790282cd94
tree       4619714e24154b32868a8394ef184b68f66aa979 /caffe2/observers
parent     a1fa9d8cf9b2b0e7373ec420c2487d4dfd0e587c
Add cost into profile observer (#12793)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12793
Add analytical cost to the profile observer. It reports op-level cost information for each op run, plus net-level cost information aggregated per op type.
It outputs the following information:
1. analytical flops
2. analytical bytes_read
3. analytical bytes_written
Example output at op level:
```
I1017 14:58:14.245978 3686541 profile_observer_gpu.cc:26] --------- Starting operator FC op#24 ---------
I1017 14:58:14.246049 3686541 profile_observer_gpu.cc:33] Input 0: Tensor model1/embedded_encoder_inputs of type float. Dims: (17,1,256,):
I1017 14:58:14.246109 3686541 profile_observer_gpu.cc:33] Input 1: Tensor model1/encoder/layer0/fw/milstm/i2h_w of type float. Dims: (2048,256,):
I1017 14:58:14.246176 3686541 profile_observer_gpu.cc:33] Input 2: Tensor model1/encoder/layer0/fw/milstm/i2h_b of type float. Dims: (2048,):
I1017 14:58:14.246217 3686541 profile_observer_gpu.cc:44] Argument 0: name: "use_cudnn" i: 1
I1017 14:58:14.246271 3686541 profile_observer_gpu.cc:44] Argument 1: name: "cudnn_exhaustive_search" i: 0
I1017 14:58:14.246338 3686541 profile_observer_gpu.cc:44] Argument 2: name: "order" s: "NHWC"
I1017 14:58:14.246372 3686541 profile_observer_gpu.cc:44] Argument 3: name: "axis" i: 2
I1017 14:58:14.246418 3686541 profile_observer_gpu.cc:44] Argument 4: name: "quantization_scheme" i: 1
I1017 14:58:14.246470 3686541 profile_observer_gpu.cc:53] Output 0: Tensor model1/encoder/layer0/fw/milstm/i2h of type float. Dims: (17,1,2048,):
I1017 14:58:14.246596 3686541 profile_observer_gpu.cc:61] Cost (flops, bytes_read, bytes_written):
I1017 14:58:14.246649 3686541 profile_observer_gpu.cc:62] 17860608 2122752 139264
I1017 14:58:14.246677 3686541 profile_observer_gpu.cc:64] --------- Finished operator FC in 0.764221 ms ---------
```
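The FC numbers above follow from the cost-inference formulas that the new profile_observer_test.cc asserts: flops = M * N * (2K + 1), bytes_read = (K * (M + N) + N) * sizeof(float), bytes_written = M * N * sizeof(float). A standalone sketch (not part of this diff), plugging in the dimensions read off the log (input (17, 1, 256) with axis = 2 gives M = 17 * 1 and K = 256; weight (2048, 256) gives N = 2048), reproduces them:
```
// Standalone sketch: reproduces the FC cost numbers printed above using the
// same formulas the new profile_observer_test.cc asserts for the FC op.
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t M = 17, N = 2048, K = 256;

  const uint64_t flops = M * N * (2 * K + 1);                    // 17860608
  const uint64_t bytes_read = (K * (M + N) + N) * sizeof(float); // 2122752
  const uint64_t bytes_written = M * N * sizeof(float);          // 139264

  std::cout << flops << " " << bytes_read << " " << bytes_written << "\n";
  return 0;
}
```
Compiled with any C++11 compiler, it prints `17860608 2122752 139264`, matching the cost line in the log above.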
Example output at net level:
```
I1017 11:13:44.675585 3146691 profile_observer_gpu.cc:165] ================ Detailed stats for net model0/encoder/layer0/bw/milstm ================
I1017 11:13:44.675662 3146691 profile_observer_gpu.cc:167] Cost (flops, bytes_read, bytes_written) per operator type:
I1017 11:13:44.675706 3146691 profile_observer_gpu.cc:169] 20992000 42045440 81920 FC
I1017 11:13:44.675745 3146691 profile_observer_gpu.cc:169] 20480 163840 81920 Mul
I1017 11:13:44.675824 3146691 profile_observer_gpu.cc:169] 20480 163840 81920 Sum
I1017 11:13:44.675878 3146691 profile_observer_gpu.cc:169] 0 0 0 ElementwiseLinear
I1017 11:13:44.675909 3146691 profile_observer_gpu.cc:169] 0 0 0 LSTMUnit
I1017 11:13:44.675958 3146691 profile_observer_gpu.cc:169] 0 0 0 rnn_internal_apply_link
```
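The new profile_observer_test.cc (included in this diff) doubles as a usage example: attach a ProfileObserver to a net, run it, then read the aggregated per-op-type cost. A trimmed sketch of that pattern, assuming a Workspace `ws` whose input blobs already exist and a NetDef `net_def`; the helper name `profile_net` is illustrative only:
```
#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/observers/profile_observer.h"

// Illustrative helper (not part of this diff): profiles one run of `net_def`
// inside `ws` and logs the aggregated analytical FC cost.
void profile_net(caffe2::Workspace& ws,
                 const std::shared_ptr<caffe2::NetDef>& net_def) {
  caffe2::NetBase* net = ws.CreateNet(net_def, true /*overwrite*/);

  // Keep a raw pointer so the observer can be queried after Run().
  auto net_ob = caffe2::make_unique<caffe2::ProfileObserver>(net);
  const auto* ob = net_ob.get();
  auto* ref = net->AttachObserver(std::move(net_ob));

  net->Run();

  // Aggregated analytical cost per op type (flops, bytes_read, bytes_written).
  auto cost_per_op_type = ob->getAggregatedOpTypeCost();
  LOG(INFO) << "FC flops: " << cost_per_op_type["FC"].flops;

  net->DetachObserver(ref);
}
```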
Reviewed By: mdschatz
Differential Revision: D10428917
fbshipit-source-id: 7c100e551bdd3ac8d7c09be12c72d70a2d67cae1
Diffstat (limited to 'caffe2/observers')
-rw-r--r--  caffe2/observers/profile_observer.h       | 84
-rw-r--r--  caffe2/observers/profile_observer_gpu.cc  | 36
-rw-r--r--  caffe2/observers/profile_observer_test.cc | 81
3 files changed, 188 insertions(+), 13 deletions(-)
```
diff --git a/caffe2/observers/profile_observer.h b/caffe2/observers/profile_observer.h
index ab110da088..b0cd04df77 100644
--- a/caffe2/observers/profile_observer.h
+++ b/caffe2/observers/profile_observer.h
@@ -31,9 +31,11 @@ namespace caffe2 {
 
 /**
  * This observer displays a description of each operator executed in a network.
- * This includes input and tensors (name, size, type), arguments, and execution
- * time. This can be used to analyze different performance characteristics.
- * NOTE: Currently this observer only supports synchronized computation
+ * This includes input and tensors (name, size, type), arguments, analytical
+ *cost and execution time. This can be used to analyze different performance
+ *characteristics. NOTE: Currently this observer only supports synchronized
+ *computation. And for RNN, --caffe2_rnn_executor=False need to be set if want
+ *to get the cost summary at the net level.
  **/
 class ProfileObserver;
 
@@ -52,21 +54,30 @@ class ProfileCounter {
 class ProfileOperatorObserver : public ProfileCounter,
                                 public ObserverBase<OperatorBase> {
  public:
+  struct DetailedStat {
+    string opType;
+    struct OpSchema::Cost c;
+  };
   explicit ProfileOperatorObserver(OperatorBase* subject) = delete;
   explicit ProfileOperatorObserver(
       OperatorBase* subject,
+      DetailedStat* stat,
       ProfileObserver* netObserver)
-      : ObserverBase<OperatorBase>(subject), netObserver_(netObserver) {
+      : ObserverBase<OperatorBase>(subject),
+        stat_(stat),
+        netObserver_(netObserver) {
+    stat->opType = subject->debug_def().type();
     if (subject) {
       net_position_ = subject->net_position();
     }
   }
   explicit ProfileOperatorObserver(
       OperatorBase* subject,
+      DetailedStat* stat,
       ProfileObserver* netObserver,
       int net_position,
       int rnn_order)
-      : ProfileOperatorObserver(subject, netObserver) {
+      : ProfileOperatorObserver(subject, stat, netObserver) {
     net_position_ = net_position;
     rnn_order_ = rnn_order;
   }
@@ -86,7 +97,37 @@ class ProfileOperatorObserver : public ProfileCounter,
     return ss.str();
   }
 
+  OpSchema::Cost getOpCost() {
+    const string& op_type = subject_->debug_def().type();
+    auto* schema = OpSchemaRegistry::Schema(op_type);
+    OpSchema::Cost cost;
+    if (schema && schema->HasCostInferenceFunction()) {
+      vector<TensorShape> shapes = subject_->InputTensorShapes();
+
+      auto known_shapes = std::accumulate(
+          shapes.begin(),
+          shapes.end(),
+          true,
+          [](bool acc, const TensorShape& shape) {
+            return acc && !shape.unknown_shape();
+          });
+      if (known_shapes) {
+        cost = schema->InferCost(subject_->debug_def(), shapes);
+      }
+    }
+    return cost;
+  }
+
+  void updateDetailedStat(const OpSchema::Cost cost) {
+    stat_->c.flops += cost.flops;
+    stat_->c.bytes_read += cost.bytes_read;
+    stat_->c.bytes_written += cost.bytes_written;
+    stat_->c.params_bytes += cost.params_bytes;
+  }
+
  protected:
+  DetailedStat* stat_;
+  OpSchema::Cost cost_;
   ProfileObserver* netObserver_;
   int net_position_; // Needed because this is not visible in RNN Executor
   int rnn_order_ = OperatorBase::kNoNetPositionSet;
@@ -96,20 +137,41 @@ class ProfileOperatorObserver : public ProfileCounter,
   void Stop() override;
 };
 
-class ProfileObserver final : public OperatorAttachingNetObserver<
-                                  ProfileOperatorObserver,
-                                  ProfileObserver> {
+class ProfileObserver final : public ObserverBase<NetBase> {
  public:
   explicit ProfileObserver(NetBase* subject)
-      : OperatorAttachingNetObserver<ProfileOperatorObserver, ProfileObserver>(
-            subject,
-            this) {}
+      : ObserverBase<NetBase>(subject),
+        detailedOpStats_(subject->GetOperators().size()),
+        net_name_(subject->Name()) {
+    const auto& ops = subject->GetOperators();
+    for (int i = 0; i < ops.size(); i++) {
+      ops[i]->AttachObserver(caffe2::make_unique<ProfileOperatorObserver>(
+          ops[i], &detailedOpStats_[i], this));
+    }
+  }
+  ~ProfileObserver();
+  CaffeMap<string, OpSchema::Cost> getAggregatedOpTypeCost() const {
+    CaffeMap<string, OpSchema::Cost> cost_per_op_type;
+    for (int idx = 0; idx < detailedOpStats_.size(); ++idx) {
+      const auto& stat = detailedOpStats_[idx];
+      uint64_t flops = stat.c.flops;
+      uint64_t bytes_read = stat.c.bytes_read;
+      uint64_t bytes_written = stat.c.bytes_written;
+
+      cost_per_op_type[stat.opType].flops += flops;
+      cost_per_op_type[stat.opType].bytes_read += bytes_read;
+      cost_per_op_type[stat.opType].bytes_written += bytes_written;
+    }
+    return cost_per_op_type;
+  }
 
   void Start() override{};
   void Stop() override{};
 
  private:
   vector<const ProfileOperatorObserver*> operator_observers_;
+  std::vector<ProfileOperatorObserver::DetailedStat> detailedOpStats_;
+  std::string net_name_;
 };
 
 } // namespace caffe2
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc
index 5bd9b0a11b..518801a895 100644
--- a/caffe2/observers/profile_observer_gpu.cc
+++ b/caffe2/observers/profile_observer_gpu.cc
@@ -58,7 +58,10 @@ void ProfileOperatorObserver::Dump() const {
       LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
     }
   }
-
+  LOG(INFO) << "Cost (flops, bytes_read, bytes_written, op_type):";
+  LOG(INFO) << std::setw(15) << std::setfill(' ') << cost_.flops << " "
+            << cost_.bytes_read << " " << cost_.bytes_written << " "
+            << subject_->debug_def().type();
   LOG(INFO) << "--------- Finished operator " << subject_->debug_def().type()
             << " in " << run_time_ << " ms ---------";
 }
@@ -82,6 +85,9 @@ void ProfileOperatorObserver::Start() {
     }
   } else {
     start_time_ = timer_.MilliSeconds();
+
+    cost_ = getOpCost();
+    updateDetailedStat(cost_);
   }
 }
 
@@ -118,6 +124,32 @@ std::unique_ptr<ObserverBase<OperatorBase>> ProfileOperatorObserver::rnnCopy(
     int rnn_order) const {
   return std::unique_ptr<ObserverBase<OperatorBase>>(
       new ProfileOperatorObserver(
-          subject, netObserver_, net_position_, rnn_order));
+          subject, stat_, netObserver_, net_position_, rnn_order));
 }
+
+ProfileObserver::~ProfileObserver() {
+  static std::mutex loggingMutex;
+  std::lock_guard<std::mutex> lock(loggingMutex);
+
+  CaffeMap<string, OpSchema::Cost> cost_per_op_type = getAggregatedOpTypeCost();
+  // sort by decreasing flops.
+  std::vector<std::pair<std::string, OpSchema::Cost>> cost_per_op_type_vec(
+      cost_per_op_type.begin(), cost_per_op_type.end());
+  std::sort(
+      cost_per_op_type_vec.begin(),
+      cost_per_op_type_vec.end(),
+      [](const std::pair<std::string, OpSchema::Cost>& left,
+         const std::pair<std::string, OpSchema::Cost>& right) {
+        return left.second.flops > right.second.flops;
+      });
+  LOG(INFO) << "================ Detailed stats for net " << net_name_
+            << " ================";
+  LOG(INFO) << "Aggregated Cost (flops, bytes_read, bytes_written, op_type):";
+  for (const auto& item : cost_per_op_type_vec) {
+    LOG(INFO) << std::setw(15) << std::setfill(' ') << item.second.flops << " "
+              << item.second.bytes_read << " " << item.second.bytes_written
+              << " " << item.first;
+  }
+}
+
 } // namespace caffe2
diff --git a/caffe2/observers/profile_observer_test.cc b/caffe2/observers/profile_observer_test.cc
new file mode 100644
index 0000000000..6a98695cec
--- /dev/null
+++ b/caffe2/observers/profile_observer_test.cc
@@ -0,0 +1,81 @@
+#include "caffe2/core/common.h"
+#include "caffe2/core/net.h"
+#include "caffe2/core/observer.h"
+#include "caffe2/core/operator.h"
+#include "profile_observer.h"
+
+#include <gtest/gtest.h>
+#include "caffe2/utils/proto_utils.h"
+
+namespace caffe2 {
+
+namespace {
+
+OperatorDef* add_op(
+    const vector<string>& input,
+    const vector<string>& output,
+    const string& type,
+    NetDef* net) {
+  CHECK(net);
+  auto& op = *net->add_op();
+  op.set_type(type);
+  for (const auto& in : input) {
+    op.add_input(in);
+  }
+  for (const auto& out : output) {
+    op.add_output(out);
+  }
+
+  return net->mutable_op(net->op_size() - 1);
+}
+
+void fill_tensor(
+    const vector<int64_t>& shape,
+    const vector<float>& data,
+    TensorCPU* tensor) {
+  tensor->Resize(shape);
+  CAFFE_ENFORCE_EQ(data.size(), tensor->size());
+  auto ptr = tensor->mutable_data<float>();
+  for (int i = 0; i < tensor->size(); ++i) {
+    ptr[i] = data[i];
+  }
+}
+
+void add_blob(
+    const string& name,
+    const vector<int64_t>& shape,
+    const vector<float>& data,
+    Workspace* ws) {
+  auto* blob = ws->CreateBlob(name);
+  fill_tensor(shape, data, BlobGetMutableTensor(blob, CPU));
+}
+
+} // namespace
+
+TEST(ProfileObserverTest, TestFC) {
+  Workspace ws;
+  auto create_net_def = [&ws](int M, int N, int K) {
+    auto net_def = std::make_shared<NetDef>();
+    net_def->set_name("test");
+    add_op({"X", "W", "b"}, {"Y"}, "FC", net_def.get());
+    add_blob("W", {N, K}, vector<float>(N * K), &ws);
+    add_blob("b", {N}, vector<float>(N), &ws);
+    add_blob("X", {M, K}, vector<float>(M * K), &ws);
+    return net_def;
+  };
+
+  int M = 2, N = 3, K = 4;
+  NetBase* net = ws.CreateNet(create_net_def(M, N, K), true /*overwrite*/);
+  auto net_ob = caffe2::make_unique<ProfileObserver>(net);
+  const auto* ob = net_ob.get();
+  auto* ref = net->AttachObserver(std::move(net_ob));
+  net->Run();
+  CAFFE_ENFORCE(ob);
+  auto cost_per_op_type = ob->getAggregatedOpTypeCost();
+  CAFFE_ENFORCE(cost_per_op_type["FC"].flops == M * N * (2 * K + 1));
+  CAFFE_ENFORCE(
+      cost_per_op_type["FC"].bytes_read == (K * (M + N) + N) * sizeof(float));
+  CAFFE_ENFORCE(cost_per_op_type["FC"].bytes_written == M * N * sizeof(float));
+  net->DetachObserver(ref);
+}
+} // namespace caffe2
```