author     Teng Li <tengli@fb.com>   2018-11-21 18:21:55 -0800
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>   2018-11-21 18:23:55 -0800
commit     b26f82b0ecc9f762a33fc271186daae1303aafd9
tree       fb5f9daf66e4818cc6ab20e510ce17288ebedd21  /test/test_c10d.py
parent     b149456645a2e9e70bdac8aa9e6d47681442c200
Robust NCCL barrier improvement to cover all devices combinations (#14271)
Summary:
This covers the edge case where the same NCCL process group is run with multiple GPU combinations, not just the most recent combination. The process group now keeps track of every GPU it has previously used, and barrier() synchronizes on each of those GPUs' NCCL streams.
A test is included as well; it was verified on an 8-GPU machine.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14271
Differential Revision: D13164993
Pulled By: teng-li
fbshipit-source-id: 81e04352740ea50b5e943369e74cfcba40bb61c1
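To make the scenario concrete, here is a minimal sketch of the usage pattern this change makes safe. It assumes a single-process group (world size 1) and a machine with at least three CUDA devices; the temporary-file store is illustrative and mirrors the test added below, not the C++ change itself.

    # Minimal sketch, assuming world size 1 and >= 3 CUDA devices.
    import tempfile

    import torch
    import torch.distributed as c10d

    file = tempfile.NamedTemporaryFile(delete=False)
    store = c10d.FileStore(file.name, 1)
    pg = c10d.ProcessGroupNCCL(store, 0, 1)
    opts = c10d.AllreduceOptions()

    # Run collectives on two different GPU combinations through the
    # same process group: first GPUs {0, 1}, then GPUs {0, 1, 2}.
    pg.allreduce([torch.ones(1).cuda(0), torch.ones(1).cuda(1)], opts)
    pg.allreduce([torch.ones(1).cuda(d) for d in range(3)], opts)

    # Previously, barrier() only synchronized the NCCL streams of the
    # GPUs used by the last collective; it now synchronizes the stream
    # of every GPU this process group has used.
    pg.barrier()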
Diffstat (limited to 'test/test_c10d.py')
-rw-r--r--  test/test_c10d.py | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+), 0 deletions(-)
diff --git a/test/test_c10d.py b/test/test_c10d.py
index ab3053ebbf..3c67da1378 100644
--- a/test/test_c10d.py
+++ b/test/test_c10d.py
@@ -1113,6 +1113,36 @@ class ProcessGroupNCCLTest(TestCase):
             for s_idx, t in enumerate(device_ts):
                 self.assertEqual(torch.Tensor([s_idx]), t)

+    def test_barrier(self):
+        store = c10d.FileStore(self.file.name, self.world_size)
+        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+
+        def allreduce(tensors):
+            opts = c10d.AllreduceOptions()
+            work = pg.allreduce(tensors, opts)
+            return work
+
+        # Make the collectives operate on
+        # 2, 3, ..., self.num_gpus GPUs
+        tensors_list = [[] for _ in range(2, self.num_gpus + 1)]
+        for i in range(2, self.num_gpus + 1):
+            for j in range(i):
+                tensors_list[i - 2].append(torch.Tensor([j + 1]).cuda(j))
+
+        works = []
+        for tensors in tensors_list:
+            work = allreduce(tensors)
+            works.append(work)
+
+        # Barrier will ensure that all previous work is completed
+        pg.barrier()
+
+        for i in range(2, self.num_gpus + 1):
+            for j in range(i):
+                self.assertEqual(
+                    torch.Tensor([float(i * (i + 1) / 2)]),
+                    tensors_list[i - 2][j])
+

 class Net(nn.Module):
     def __init__(self):
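For reference, the tensor on GPU j within a combination of i GPUs starts at j + 1, so after the allreduce every tensor in that combination should hold 1 + 2 + ... + i = i * (i + 1) / 2. The assertions after the barrier only observe these final values reliably because barrier() now waits on the NCCL streams of all GPUs used by the earlier collectives, not just the last combination's.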