| author | soumith <soumith@fb.com> | 2017-10-13 08:50:00 -0700 |
|---|---|---|
| committer | soumith <soumith@fb.com> | 2017-10-13 08:50:00 -0700 |
| commit | 5a96037810eb043235d53d9f9200867500259f65 (patch) | |
| tree | 95a80984895cd1d4061719dd8331917720446333 /torch/csrc/cuda | |
| parent | 8f26d6aabcad991da88b663467ee2080a38631f7 (diff) | |
skip ncclCommDestroy if CUDA driver is already unloaded
Diffstat (limited to 'torch/csrc/cuda')
-rw-r--r-- | torch/csrc/cuda/nccl.cpp | 11 |
1 file changed, 9 insertions, 2 deletions
```diff
diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp
index a035495156..0ae936dcdd 100644
--- a/torch/csrc/cuda/nccl.cpp
+++ b/torch/csrc/cuda/nccl.cpp
@@ -28,6 +28,13 @@ struct NcclCommList {
   ~NcclCommList() {
     if (comms) {
       for (int i = 0; i < ndevices; i++) {
+        int dummy_var;
+        if (cudaGetDevice(&dummy_var) != cudaSuccess) {
+          /* there are cases when this destructor is called after the
+             CUDA driver is already unloaded from the process.
+             In these cases, skip ncclCommDestroy */
+          return;
+        }
         ncclCommDestroy(comms[i]);
       }
     }
@@ -107,7 +114,7 @@ static void _check_inputs(std::vector<at::Tensor> &inputs, std::vector<at::Tenso
     if (input.numel() != numel) {
       throw std::runtime_error("all inputs must have the same number of elements");
     }
-
+
     if (output.numel() * output_multiplier != numel * input_multiplier) {
       throw std::runtime_error("output must be of size input_size * size_multiplier");
     }
@@ -144,7 +151,7 @@ PyObject * THCPModule_nccl_reduce(PyObject *self, PyObject *args) {
   std::vector<THCStream*> streams = THPUtils_PySequence_to_THCStreamList(_streams);
 
   THPUtils_assert(inputs.size() == streams.size(), "number of streams is not equal to number of inputs");
-
+
   // we can safely release GIL after this line, no python API used
   AutoNoGIL no_gil;
   _check_inputs(inputs, outputs, 1, 1);
```
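The guard generalizes beyond NCCL: any RAII type that releases CUDA-backed resources in its destructor can run after the CUDA driver has already been unloaded, since destruction order of statics at process exit is unspecified. Below is a minimal standalone sketch of the same pattern, assuming the CUDA runtime and NCCL headers are available; `CommHolder` is a hypothetical name for illustration, not PyTorch's type:

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

// Hypothetical RAII wrapper illustrating the guard added in this
// commit: probe the driver with a cheap CUDA runtime call before
// destroying communicators.
struct CommHolder {
  ncclComm_t* comms = nullptr;
  int ndevices = 0;

  ~CommHolder() {
    if (!comms) return;
    int dummy_device;
    // cudaGetDevice fails once the driver has been unloaded (e.g.
    // during process teardown); the OS reclaims the resources in
    // that case, so skipping ncclCommDestroy is safe. The commit
    // performs this check inside the loop; hoisting it out once,
    // as done here, is equivalent for a single destructor run.
    if (cudaGetDevice(&dummy_device) != cudaSuccess) {
      return;
    }
    for (int i = 0; i < ndevices; i++) {
      ncclCommDestroy(comms[i]);
    }
  }
};
```

The probe works because `cudaGetDevice` is a lightweight runtime call that returns an error rather than crashing when no driver is present, making it a safe sentinel for "is CUDA still usable from this process".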