diff options
author | Summer Deng <summerdeng@fb.com> | 2019-03-08 19:00:43 -0800 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-03-08 19:03:54 -0800 |
commit | c10c73f0473baf80305bb800dbee113ff1067a95 (patch) | |
tree | b6b18cbbf1d0c4123ac3faeb9e64e869e9e0d970 /caffe2/quantization | |
parent | 0fd1dc45c037ae806352e08a7b83eeacfad101a0 (diff) | |
download | pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.tar.gz pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.tar.bz2 pytorch-c10c73f0473baf80305bb800dbee113ff1067a95.zip |
Int8 FC performance debugging (#17700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17700
Add performance debugging utilities in DNNLOWP FC operator and the python script
Reviewed By: amylittleyang
Differential Revision: D14321299
fbshipit-source-id: 50dbd7b352a1da5d2ecb659d8003e71e70750063
Diffstat (limited to 'caffe2/quantization')
-rw-r--r-- | caffe2/quantization/server/fully_connected_dnnlowp_op.cc | 149 |
1 files changed, 102 insertions, 47 deletions
diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc index c19f43e975..5eee0a9604 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc @@ -1,6 +1,8 @@ #include "fully_connected_dnnlowp_op.h" +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN #include <chrono> +#endif #include "caffe2/core/flags.h" #include "caffe2/core/tensor_int8.h" @@ -91,25 +93,30 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() { return true; } +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end; - - if (VLOG_IS_ON(3)) { + /* if (VLOG_IS_ON(3)) */ + { t_begin = chrono::system_clock::now(); t_very_begin = t_begin; } +#endif // Get quantization parameters if (!GetQuantizationParameters_()) { return false; } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(3)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif const auto& X = InputTensorCPU_(0); const auto& W = InputTensorCPU_(1); @@ -130,13 +137,17 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() { const T* Xdata = nullptr; vector<T> X_temp; - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif + if (Wq_packed_) { // fast path to use fbgemm using namespace fbgemm; @@ -144,22 +155,31 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() { if (X.template IsType<T>() || !dequantize_output_) { // Only when input and output are float, we don't need input to be // quantized. - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp); - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this + << " input quantization: " << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif } - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + if (!dequantize_output_) { Y_int32_.resize(Y->size()); DoNothing<> doNothingObj{}; @@ -396,17 +416,24 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() { } // dequantize_output } else { // Quantize X - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp); - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " input quantization: " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " input quantization: " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif Y_int32_.resize(Y->size()); for (int i = 0; i < M; ++i) { @@ -429,13 +456,16 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() { StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1)); } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif // Adjust with bias and zero_point and then requantize // See batch_matmul_dnnlowp_op.cc to why we compute column_offsets, @@ -495,20 +525,23 @@ bool FullyConnectedDNNLowPOp<T>::RunOnDevice() { MeasureQuantizationError_(); - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this - << " bias-offset-requantization: " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this + << " bias-offset-requantization: " << dt * 1e3 << " ms"; t_end = chrono::system_clock::now(); double ops = 2. * M * N * K; dt = chrono::duration<double>(t_end - t_very_begin).count(); double gops = ops / dt / 1e9; - VLOG(3) << "@PERF this=" << this - << " output=" << this->debug_def().output(0) << " " << M << "x" << N - << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops"; + LOG(INFO) << "@PERF this=" << this + << " output=" << this->debug_def().output(0) << " " << M << "x" + << N << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops"; } +#endif return true; } @@ -517,20 +550,25 @@ template <typename T> bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() { using namespace dnnlowp; +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN chrono::time_point<chrono::system_clock> t_begin, t_end; - if (VLOG_IS_ON(3)) { - t_begin = chrono::system_clock::now(); - } + /* if (VLOG_IS_ON(1)) */ + { t_begin = chrono::system_clock::now(); } +#endif + // Choose quantization for X in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get()); - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf " - << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf " + << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif // Quantize W const auto& X = InputTensorCPU_(0); @@ -618,12 +656,16 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() { filter_qparams_[0]); } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif + // Pre-compute column_offset // If input tensor doesn't use dynamic quantization, we fold column_offsets_ // into bias. @@ -641,13 +683,17 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() { K, N, W_quantized_.data(), filter_qparams_, *column_offsets_); } } - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " Calculate column offset " << dt * 1e3 - << " ms"; + LOG(INFO) << "@PERF this=" << this << " Calculate column offset " + << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif // Quantize bias if (!is_weight_constant_ || (!b_quantized_data_ && !b_dequantized_data_) || @@ -736,12 +782,16 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() { vector<T_signed>().swap(W_quantized_); } - if (VLOG_IS_ON(3)) { +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " Quantize bias " << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " Quantize bias " << dt * 1e3 + << " ms"; t_begin = chrono::system_clock::now(); } +#endif if (!dequantize_output_ && !requantization_param_selected_) { GetOutputQuantizationParams_(); @@ -762,13 +812,18 @@ bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() { Fp32Op_()->Get()->RunOnDevice(); } } - if (VLOG_IS_ON(3)) { + +#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN + /* if (VLOG_IS_ON(1)) */ + { t_end = chrono::system_clock::now(); double dt = chrono::duration<double>(t_end - t_begin).count(); - VLOG(3) << "@PERF this=" << this << " GetOutputQuantizationParams " - << dt * 1e3 << " ms"; + LOG(INFO) << "@PERF this=" << this << " GetOutputQuantizationParams " + << dt * 1e3 << " ms"; t_begin = chrono::system_clock::now(); } +#endif + return true; } |