Int8 FC performance debugging (#17700)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17700

Add performance debugging utilities to the DNNLOWP FC operator and the Python script.

Reviewed By: amylittleyang

Differential Revision: D14321299

fbshipit-source-id: 50dbd7b352a1da5d2ecb659d8003e71e70750063
author: Summer Deng <summerdeng@fb.com> 2019-03-08 19:00:43 -0800
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-03-08 19:03:54 -0800
commit: c10c73f0473baf80305bb800dbee113ff1067a95 (patch)
tree: b6b18cbbf1d0c4123ac3faeb9e64e869e9e0d970 /caffe2/quantization
parent: 0fd1dc45c037ae806352e08a7b83eeacfad101a0 (diff)
download: pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.tar.gz
pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.tar.bz2
pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.zip
1 files changed, 102 insertions, 47 deletions
diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
index c19f43e975..5eee0a9604 100644
--- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
+++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc
@@ -1,6 +1,8 @@
 #include "fully_connected_dnnlowp_op.h"
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
 #include <chrono>
+#endif
 
 #include "caffe2/core/flags.h"
 #include "caffe2/core/tensor_int8.h"
@@ -91,25 +93,30 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     return true;
   }
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
   chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end;
-
-  if (VLOG_IS_ON(3)) {
+  /* if (VLOG_IS_ON(3)) */
+  {
     t_begin = chrono::system_clock::now();
     t_very_begin = t_begin;
   }
+#endif
 
   // Get quantization parameters
   if (!GetQuantizationParameters_()) {
     return false;
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(3)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
-            << " ms";
+    LOG(INFO) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
+              << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   const auto& X = InputTensorCPU_(0);
   const auto& W = InputTensorCPU_(1);
@@ -130,13 +137,17 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
   const T* Xdata = nullptr;
   vector<T> X_temp;
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3
-            << " ms";
+    LOG(INFO) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3
+              << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
+
   if (Wq_packed_) {
     // fast path to use fbgemm
     using namespace fbgemm;
@@ -144,22 +155,31 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     if (X.template IsType<T>() || !dequantize_output_) {
       // Only when input and output are float, we don't need input to be
       // quantized.
-      if (VLOG_IS_ON(3)) {
-        t_begin = chrono::system_clock::now();
-      }
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+      /* if (VLOG_IS_ON(1)) */
+      { t_begin = chrono::system_clock::now(); }
+#endif
+
       Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
-      if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+      /* if (VLOG_IS_ON(1)) */
+      {
         t_end = chrono::system_clock::now();
         double dt = chrono::duration<double>(t_end - t_begin).count();
-        VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3
-                << " ms";
+        LOG(INFO) << "@PERF this=" << this
+                  << " input quantization: " << dt * 1e3 << " ms";
         t_begin = chrono::system_clock::now();
       }
+#endif
     }
 
-    if (VLOG_IS_ON(3)) {
-      t_begin = chrono::system_clock::now();
-    }
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(1)) */
+    { t_begin = chrono::system_clock::now(); }
+#endif
+
     if (!dequantize_output_) {
       Y_int32_.resize(Y->size());
       DoNothing<> doNothingObj{};
@@ -396,17 +416,24 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     } // dequantize_output
   } else {
     // Quantize X
-    if (VLOG_IS_ON(3)) {
-      t_begin = chrono::system_clock::now();
-    }
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(1)) */
+    { t_begin = chrono::system_clock::now(); }
+#endif
+
     Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
-    if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+    /* if (VLOG_IS_ON(1)) */
+    {
       t_end = chrono::system_clock::now();
       double dt = chrono::duration<double>(t_end - t_begin).count();
-      VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3
-              << " ms";
+      LOG(INFO) << "@PERF this=" << this << " input quantization: " << dt * 1e3
+                << " ms";
       t_begin = chrono::system_clock::now();
     }
+#endif
 
     Y_int32_.resize(Y->size());
     for (int i = 0; i < M; ++i) {
@@ -429,13 +456,16 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
     StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1));
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
 
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   // Adjust with bias and zero_point and then requantize
   // See batch_matmul_dnnlowp_op.cc to why we compute column_offsets,
@@ -495,20 +525,23 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
 
   MeasureQuantizationError_();
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this
-            << " bias-offset-requantization: " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this
+              << " bias-offset-requantization: " << dt * 1e3 << " ms";
 
     t_end = chrono::system_clock::now();
     double ops = 2. * M * N * K;
     dt = chrono::duration<double>(t_end - t_very_begin).count();
     double gops = ops / dt / 1e9;
-    VLOG(3) << "@PERF this=" << this
-            << " output=" << this->debug_def().output(0) << " " << M << "x" << N
-            << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
+    LOG(INFO) << "@PERF this=" << this
+              << " output=" << this->debug_def().output(0) << " " << M << "x"
+              << N << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
   }
+#endif
 
   return true;
 }
@@ -517,20 +550,25 @@ template <typename T>
 bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
   using namespace dnnlowp;
 
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
   chrono::time_point<chrono::system_clock> t_begin, t_end;
-  if (VLOG_IS_ON(3)) {
-    t_begin = chrono::system_clock::now();
-  }
+  /* if (VLOG_IS_ON(1)) */
+  { t_begin = chrono::system_clock::now(); }
+#endif
+
   // Choose quantization for X
   in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
-            << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
+              << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   // Quantize W
   const auto& X = InputTensorCPU_(0);
@@ -618,12 +656,16 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
         filter_qparams_[0]);
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
+
   // Pre-compute column_offset
   // If input tensor doesn't use dynamic quantization, we fold column_offsets_
   // into bias.
@@ -641,13 +683,17 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
           K, N, W_quantized_.data(), filter_qparams_, *column_offsets_);
     }
   }
-  if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " Calculate column offset " << dt * 1e3
-            << " ms";
+    LOG(INFO) << "@PERF this=" << this << " Calculate column offset "
+              << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   // Quantize bias
   if (!is_weight_constant_ || (!b_quantized_data_ && !b_dequantized_data_) ||
@@ -736,12 +782,16 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
     vector<T_signed>().swap(W_quantized_);
   }
 
-  if (VLOG_IS_ON(3)) {
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " Quantize bias " << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " Quantize bias " << dt * 1e3
+              << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
 
   if (!dequantize_output_ && !requantization_param_selected_) {
     GetOutputQuantizationParams_();
@@ -762,13 +812,18 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
       Fp32Op_()->Get()->RunOnDevice();
     }
   }
-  if (VLOG_IS_ON(3)) {
+
+#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
+  /* if (VLOG_IS_ON(1)) */
+  {
     t_end = chrono::system_clock::now();
     double dt = chrono::duration<double>(t_end - t_begin).count();
-    VLOG(3) << "@PERF this=" << this << " GetOutputQuantizationParams "
-            << dt * 1e3 << " ms";
+    LOG(INFO) << "@PERF this=" << this << " GetOutputQuantizationParams "
+              << dt * 1e3 << " ms";
     t_begin = chrono::system_clock::now();
   }
+#endif
+
   return true;
 }
author	Summer Deng <summerdeng@fb.com>	2019-03-08 19:00:43 -0800
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-03-08 19:03:54 -0800
commit	c10c73f0473baf80305bb800dbee113ff1067a95 (patch)
tree	b6b18cbbf1d0c4123ac3faeb9e64e869e9e0d970 /caffe2/quantization
parent	0fd1dc45c037ae806352e08a7b83eeacfad101a0 (diff)
download	pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.tar.gz pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.tar.bz2 pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.zip