| author | soumith <soumith@fb.com> | 2017-10-13 08:50:00 -0700 |
|---|---|---|
| committer | soumith <soumith@fb.com> | 2017-10-13 08:50:00 -0700 |
| commit | 5a96037810eb043235d53d9f9200867500259f65 (patch) | |
| tree | 95a80984895cd1d4061719dd8331917720446333 /torch/csrc/cuda | |
| parent | 8f26d6aabcad991da88b663467ee2080a38631f7 (diff) | |
skip ncclCommDestroy if CUDA driver is already unloaded
Diffstat (limited to 'torch/csrc/cuda')
-rw-r--r-- | torch/csrc/cuda/nccl.cpp | 11 |
1 file changed, 9 insertions, 2 deletions
```diff
diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp
index a035495156..0ae936dcdd 100644
--- a/torch/csrc/cuda/nccl.cpp
+++ b/torch/csrc/cuda/nccl.cpp
@@ -28,6 +28,13 @@ struct NcclCommList {
   ~NcclCommList() {
     if (comms) {
       for (int i = 0; i < ndevices; i++) {
+        int dummy_var;
+        if (cudaGetDevice(&dummy_var) != cudaSuccess) {
+          /* there are cases when this destructor is called after the
+             CUDA driver is already unloaded from the process.
+             In these cases, skip ncclCommDestroy */
+          return;
+        }
         ncclCommDestroy(comms[i]);
       }
     }
@@ -107,7 +114,7 @@ static void _check_inputs(std::vector<at::Tensor> &inputs, std::vector<at::Tenso
     if (input.numel() != numel) {
       throw std::runtime_error("all inputs must have the same number of elements");
     }
-
+
     if (output.numel() * output_multiplier != numel * input_multiplier) {
       throw std::runtime_error("output must be of size input_size * size_multiplier");
     }
@@ -144,7 +151,7 @@ PyObject * THCPModule_nccl_reduce(PyObject *self, PyObject *args) {
   std::vector<THCStream*> streams = THPUtils_PySequence_to_THCStreamList(_streams);
 
   THPUtils_assert(inputs.size() == streams.size(), "number of streams is not equal to number of inputs");
-
+
   // we can safely release GIL after this line, no python API used
   AutoNoGIL no_gil;
   _check_inputs(inputs, outputs, 1, 1);
```
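The guard generalizes beyond NCCL: any RAII type that releases CUDA-backed resources in its destructor can run after the CUDA driver has already been unloaded, since destruction order of statics at process exit is unspecified. Below is a minimal standalone sketch of the same pattern, assuming the CUDA runtime and NCCL headers are available; `CommHolder` is a hypothetical name for illustration, not PyTorch's type:

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

// Hypothetical RAII wrapper illustrating the guard added in this
// commit: probe the driver with a cheap CUDA runtime call before
// destroying communicators.
struct CommHolder {
  ncclComm_t* comms = nullptr;
  int ndevices = 0;

  ~CommHolder() {
    if (!comms) return;
    int dummy_device;
    // cudaGetDevice fails once the driver has been unloaded (e.g.
    // during process teardown); the OS reclaims the resources in
    // that case, so skipping ncclCommDestroy is safe. The commit
    // performs this check inside the loop; hoisting it out once,
    // as done here, is equivalent for a single destructor run.
    if (cudaGetDevice(&dummy_device) != cudaSuccess) {
      return;
    }
    for (int i = 0; i < ndevices; i++) {
      ncclCommDestroy(comms[i]);
    }
  }
};
```

The probe works because `cudaGetDevice` is a lightweight runtime call that returns an error rather than crashing when no driver is present, making it a safe sentinel for "is CUDA still usable from this process".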