author    Teng Li <tengli@fb.com>    2018-11-21 18:21:55 -0800
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2018-11-21 18:23:55 -0800
commit    b26f82b0ecc9f762a33fc271186daae1303aafd9 (patch)
tree      fb5f9daf66e4818cc6ab20e510ce17288ebedd21 /test/test_c10d.py
parent    b149456645a2e9e70bdac8aa9e6d47681442c200 (diff)
Robust NCCL barrier improvement to cover all device combinations (#14271)
Summary: This covers the edge case where the same NCCL process group runs collectives on multiple different GPU combinations, rather than only on the most recent combination. The process group now always keeps track of which GPUs have been used, and barrier() synchronizes on each used GPU's NCCL stream. A test is included; verified on an 8-GPU machine.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14271
Differential Revision: D13164993
Pulled By: teng-li
fbshipit-source-id: 81e04352740ea50b5e943369e74cfcba40bb61c1
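For illustration only, here is a minimal sketch of the bookkeeping described above: record every CUDA device that a collective has touched, and make barrier() wait on all of them instead of only the devices used by the most recent collective. This is an assumed, simplified Python model, not the actual ProcessGroupNCCL implementation (which is C++ and synchronizes per-device NCCL streams); the class and method names are hypothetical.

# Hypothetical, simplified model of the barrier bookkeeping described above.
# The real c10d implementation is in C++ and synchronizes NCCL streams;
# this sketch just synchronizes whole CUDA devices.
import torch

class UsedDeviceTracker:
    def __init__(self):
        self.used_device_indices = set()

    def record_collective(self, tensors):
        # Each collective records the CUDA devices its tensors live on.
        for t in tensors:
            self.used_device_indices.add(t.get_device())

    def barrier(self):
        # Wait for outstanding work on every device seen so far,
        # not just the devices of the last collective.
        for idx in sorted(self.used_device_indices):
            torch.cuda.synchronize(idx)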
Diffstat (limited to 'test/test_c10d.py')
-rw-r--r--  test/test_c10d.py | 30
1 file changed, 30 insertions, 0 deletions
diff --git a/test/test_c10d.py b/test/test_c10d.py
index ab3053ebbf..3c67da1378 100644
--- a/test/test_c10d.py
+++ b/test/test_c10d.py
@@ -1113,6 +1113,36 @@ class ProcessGroupNCCLTest(TestCase):
             for s_idx, t in enumerate(device_ts):
                 self.assertEqual(torch.Tensor([s_idx]), t)
 
+    def test_barrier(self):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        def allreduce(tensors):
+            opts = c10d.AllreduceOptions()
+            work = pg.allreduce(tensors, opts)
+            return work
+
+        # Make the collectives operate on
+        # 1, 2, 3, 4, ..., self.num_gpus GPUs
+        tensors_list = [[] for _ in range(2, self.num_gpus + 1)]
+        for i in range(2, self.num_gpus + 1):
+            for j in range(i):
+                tensors_list[i - 2].append(torch.Tensor([j + 1]).cuda(j))
+
+        works = []
+        for tensors in tensors_list:
+            work = allreduce(tensors)
+            works.append(work)
+
+        # Barrier will ensure that all previous work is completed
+        pg.barrier()
+
+        for i in range(2, self.num_gpus + 1):
+            for j in range(i):
+                self.assertEqual(
+                    torch.Tensor([float(i * (i + 1) / 2)]),
+                    tensors_list[i - 2][j])
+
 
 class Net(nn.Module):
     def __init__(self):