author     Teng Li <tengli@fb.com>   2018-11-21 18:21:55 -0800
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>   2018-11-21 18:23:55 -0800
commit     b26f82b0ecc9f762a33fc271186daae1303aafd9
tree       fb5f9daf66e4818cc6ab20e510ce17288ebedd21  /test/test_c10d.py
parent     b149456645a2e9e70bdac8aa9e6d47681442c200
Robust NCCL barrier improvement to cover all devices combinations (#14271)
Summary:
This covers the edge case where the same NCCL process group is run with multiple GPU combinations, not just the most recent combination. The process group now keeps track of every GPU it has previously used, and barrier() synchronizes on each of those GPUs' NCCL streams.
A test is included as well; it was verified on an 8-GPU machine.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14271
Differential Revision: D13164993
Pulled By: teng-li
fbshipit-source-id: 81e04352740ea50b5e943369e74cfcba40bb61c1
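To make the scenario concrete, here is a minimal sketch of the usage pattern this change makes safe. It assumes a single-process group (world size 1) and a machine with at least three CUDA devices; the temporary-file store is illustrative and mirrors the test added below, not the C++ change itself.

    # Minimal sketch, assuming world size 1 and >= 3 CUDA devices.
    import tempfile

    import torch
    import torch.distributed as c10d

    file = tempfile.NamedTemporaryFile(delete=False)
    store = c10d.FileStore(file.name, 1)
    pg = c10d.ProcessGroupNCCL(store, 0, 1)
    opts = c10d.AllreduceOptions()

    # Run collectives on two different GPU combinations through the
    # same process group: first GPUs {0, 1}, then GPUs {0, 1, 2}.
    pg.allreduce([torch.ones(1).cuda(0), torch.ones(1).cuda(1)], opts)
    pg.allreduce([torch.ones(1).cuda(d) for d in range(3)], opts)

    # Previously, barrier() only synchronized the NCCL streams of the
    # GPUs used by the last collective; it now synchronizes the stream
    # of every GPU this process group has used.
    pg.barrier()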
Diffstat (limited to 'test/test_c10d.py')
-rw-r--r--  test/test_c10d.py | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+), 0 deletions(-)
diff --git a/test/test_c10d.py b/test/test_c10d.py
index ab3053ebbf..3c67da1378 100644
--- a/test/test_c10d.py
+++ b/test/test_c10d.py
@@ -1113,6 +1113,36 @@ class ProcessGroupNCCLTest(TestCase):
             for s_idx, t in enumerate(device_ts):
                 self.assertEqual(torch.Tensor([s_idx]), t)

+    def test_barrier(self):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        def allreduce(tensors):
+            opts = c10d.AllreduceOptions()
+            work = pg.allreduce(tensors, opts)
+            return work
+
+        # Make the collectives operate on
+        # 2, 3, ..., self.num_gpus GPUs
+        tensors_list = [[] for _ in range(2, self.num_gpus + 1)]
+        for i in range(2, self.num_gpus + 1):
+            for j in range(i):
+                tensors_list[i - 2].append(torch.Tensor([j + 1]).cuda(j))
+
+        works = []
+        for tensors in tensors_list:
+            work = allreduce(tensors)
+            works.append(work)
+
+        # Barrier will ensure that all previous work is completed
+        pg.barrier()
+
+        for i in range(2, self.num_gpus + 1):
+            for j in range(i):
+                self.assertEqual(
+                    torch.Tensor([float(i * (i + 1) / 2)]),
+                    tensors_list[i - 2][j])
+

 class Net(nn.Module):
     def __init__(self):
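For reference, the tensor on GPU j within a combination of i GPUs starts at j + 1, so after the allreduce every tensor in that combination should hold 1 + 2 + ... + i = i * (i + 1) / 2. The assertions after the barrier only observe these final values reliably because barrier() now waits on the NCCL streams of all GPUs used by the earlier collectives, not just the last combination's.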