Use open-source NCCL2 in PyTorch (#12312)

Summary:
- Removed the old NCCL files.
- Made open-source NCCL a submodule.
- Updated CMake to build NCCL itself.

NCCL2 is now in the default build.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/12312
Differential Revision: D10190845
Pulled By: teng-li
fbshipit-source-id: 08d42253b774149a66919d194f88b34628c39bae
author: Teng Li <tengli@fb.com> 2018-10-04 11:31:26 -0700
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2018-10-04 11:42:17 -0700
commit: ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade (patch)
tree: 6bd56dd8fc590ce3baa40cdfceddd58cb2186218 /third_party
parent: 6b79e16d6dbd5b3c71775c69a770a17cbe0b2f08 (diff)
download: pytorch-ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade.tar.gz
pytorch-ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade.tar.bz2
pytorch-ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade.zip
59 files changed, 2 insertions, 9554 deletions
diff --git a/third_party/nccl/.gitignore b/third_party/nccl/.gitignore
deleted file mode 100644
index 34a07c2883..0000000000
--- a/third_party/nccl/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
-/build
diff --git a/third_party/nccl/CMakeLists.txt b/third_party/nccl/CMakeLists.txt
index 695a4d9f93..2ac095ff1d 100644
--- a/third_party/nccl/CMakeLists.txt
+++ b/third_party/nccl/CMakeLists.txt
@@ -11,7 +11,7 @@ string(REPLACE "-gencode;" "-gencode=" NVCC_GENCODE "${NVCC_GENCODE}")
 message(STATUS "Set NVCC_GENCODE for building NCCL: ${NVCC_GENCODE}")
 
 ADD_CUSTOM_COMMAND(
-   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/nccl
    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/lib/libnccl.so
    COMMAND env CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR} NVCC=${CUDA_NVCC_EXECUTABLE} BUILDDIR=${CMAKE_CURRENT_BINARY_DIR} NVCC_GENCODE="${NVCC_GENCODE}" make -j${NUM_JOBS}
 )
diff --git a/third_party/nccl/LICENSE.txt b/third_party/nccl/LICENSE.txt
deleted file mode 100644
index c7efd73e16..0000000000
--- a/third_party/nccl/LICENSE.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-
- Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-  * Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer in the
-    documentation and/or other materials provided with the distribution.
-  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
-    Laboratory, the U.S. Department of Energy, nor the names of their
-    contributors may be used to endorse or promote products derived
-    from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- The U.S. Department of Energy funded the development of this software
- under subcontract 7078610 with Lawrence Berkeley National Laboratory.
-
diff --git a/third_party/nccl/Makefile b/third_party/nccl/Makefile
deleted file mode 100644
index 96b0719c6a..0000000000
--- a/third_party/nccl/Makefile
+++ /dev/null
@@ -1,260 +0,0 @@
-#
-# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
-#
-# See LICENCE.txt for license information
-#
-
-CUDA_HOME ?= /usr/local/cuda
-PREFIX ?= /usr/local
-VERBOSE ?= 0
-KEEP ?= 0
-DEBUG ?= 0
-PROFAPI ?= 0
-BUILDDIR ?= build
-BUILDDIR := $(abspath $(BUILDDIR))
-
-CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-NVCC ?= $(CUDA_HOME)/bin/nvcc
-
-CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
-CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
-CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
-
-ifeq ($(CUDA_MAJOR), 7)
-NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
-                -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_52,code=sm_52 \
-                -gencode=arch=compute_52,code=compute_52
-else ifeq ($(CUDA_MAJOR), 8)
-NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
-                -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_52,code=sm_52 \
-                -gencode=arch=compute_60,code=sm_60\
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_60,code=compute_60
-else
-NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \
-                -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
-                -gencode=arch=compute_52,code=sm_52 \
-                -gencode=arch=compute_60,code=sm_60\
-                -gencode=arch=compute_61,code=sm_61 \
-                -gencode=arch=compute_60,code=compute_60 \
-                -gencode=arch=compute_70,code=compute_70
-endif
-
-CXXFLAGS   := -I$(CUDA_INC) -fPIC -fvisibility=hidden
-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96 -Xfatbin -compress-all
-# Use addprefix so that we can specify more than one path
-LDFLAGS    := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
-
-# If CUDA < 8.0, add workaround C++ flags
-CUDA_MAJOR_LT_8 := $(shell [ $(CUDA_MAJOR) -lt 8 ] && echo true)
-ifeq ($(CUDA_MAJOR_LT_8), true)
-CXXFLAGS += -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__
-endif
-
-ifeq ($(DEBUG), 0)
-NVCUFLAGS += -O3
-CXXFLAGS  += -O3
-else
-NVCUFLAGS += -O0 -G
-CXXFLAGS  += -O0 -g -ggdb3
-endif
-
-ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
-CXXFLAGS  += -Wall -Wextra
-else
-.SILENT:
-endif
-
-ifneq ($(KEEP), 0)
-NVCUFLAGS += -keep
-endif
-
-ifneq ($(PROFAPI), 0)
-CXXFLAGS += -DPROFAPI
-endif
-
-NCCL_MAJOR   := 1
-NCCL_MINOR   := 3
-NCCL_PATCH   := 5
-CXXFLAGS  += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH)
-
-CXXFLAGS  += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
-
-.PHONY : all lib staticlib clean test mpitest install deb debian debclean forlib fortest forclean
-.DEFAULT : all
-
-INCEXPORTS  := nccl.h
-LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
-LIBNAME     := libnccl.so
-STATICLIBNAME := libnccl_static.a
-
-INCDIR := $(BUILDDIR)/include
-LIBDIR := $(BUILDDIR)/lib
-OBJDIR := $(BUILDDIR)/obj
-
-INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
-LIBSONAME  := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
-LIBTARGET  := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
-STATICLIBTARGET := $(STATICLIBNAME)
-LIBLINK    := $(patsubst lib%.so, -l%, $(LIBNAME))
-LIBOBJ     := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES)))
-DEPFILES   := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
-
-all : lib staticlib
-
-lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-
-staticlib : $(INCTARGETS) $(LIBDIR)/$(STATICLIBTARGET)
-
--include $(DEPFILES)
-
-$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
-	@printf "Linking   %-35s > %s\n" $(LIBTARGET) $@
-	mkdir -p $(LIBDIR)
-	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
-	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
-	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
-
-$(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
-	@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
-	mkdir -p $(LIBDIR)
-	ar cr $@ $(LIBOBJ)
-
-$(INCDIR)/%.h : src/%.h
-	@printf "Grabbing  %-35s > %s\n" $< $@
-	mkdir -p $(INCDIR)
-	cp -f $< $@
-
-$(OBJDIR)/%.o : src/%.cu
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p $(OBJDIR)
-	$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
-	@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
-	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
-	@rm -f $(@:%.o=%.d.tmp)
-
-clean :
-	rm -rf $(BUILDDIR)
-
-install : lib
-	mkdir -p $(PREFIX)/lib
-	mkdir -p $(PREFIX)/include
-	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
-	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
-
-
-#### TESTS ####
-
-TEST_ONLY ?= 0
-
-# Tests depend on lib, except in TEST_ONLY mode.
-ifeq ($(TEST_ONLY), 0)
-TSTDEP = $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-endif
-
-NCCL_LIB ?= $(LIBDIR)
-NCCL_INC ?= $(INCDIR)
-
-MPI_HOME ?= /usr
-MPI_INC ?= $(MPI_HOME)/include
-MPI_LIB ?= $(MPI_HOME)/lib
-MPIFLAGS   := -I$(MPI_INC) -L$(MPI_LIB) -lmpi
-
-TESTS       := all_gather_test     all_gather_scan \
-               all_reduce_test     all_reduce_scan \
-               broadcast_test      broadcast_scan \
-               reduce_test         reduce_scan \
-               reduce_scatter_test reduce_scatter_scan
-MPITESTS    := mpi_test
-
-TSTINC     := -I$(NCCL_INC) -Itest/include
-TSTLIB     := -L$(NCCL_LIB) $(LIBLINK) $(LDFLAGS)
-TSTDIR     := $(BUILDDIR)/test/single
-MPITSTDIR  := $(BUILDDIR)/test/mpi
-TESTBINS   := $(patsubst %, $(TSTDIR)/%, $(TESTS))
-MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
-
-test : $(TESTBINS)
-
-$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
-	@printf "Building  %-35s > %s\n" $< $@
-	mkdir -p $(TSTDIR)
-	$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
-	@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
-	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
-	@rm -f $(@:%=%.d.tmp)
-
-mpitest : $(MPITESTBINS)
-
-$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
-	@printf "Building  %-35s > %s\n" $< $@
-	mkdir -p $(MPITSTDIR)
-	$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
-	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
-	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
-	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
-                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
-	@rm -f $(@:%=%.d.tmp)
-
-#### PACKAGING ####
-
-DEBIANDIR  := $(BUILDDIR)/debian
-
-DEBGEN_IN  := $(shell (cd debian ; ls *.in))
-DEBGEN     := $(DEBGEN_IN:.in=)
-DEBFILES   := compat copyright libnccl-dev.install libnccl-dev.manpages nccl.7 rules $(DEBGEN)
-DEBTARGETS := $(patsubst %, $(DEBIANDIR)/%, $(DEBFILES))
-
-DEB_REVISION   ?= 1
-DEB_TIMESTAMP  := $(shell date -R)
-DEB_ARCH       ?= amd64
-
-debian : $(DEBTARGETS)
-
-deb : lib debian
-	@printf "Building Debian package\n"
-	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
-	mkdir -p $(BUILDDIR)/deb/
-	mv $(BUILDDIR)/../libnccl*.deb $(BUILDDIR)/deb/
-
-debclean :
-	rm -Rf $(DEBIANDIR)
-
-$(DEBIANDIR)/% : debian/%.in
-	@printf "Generating %-35s > %s\n" $< $@
-	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-	    -e "s/\$${deb:Revision}/$(DEB_REVISION)/g" \
-	    -e "s/\$${deb:Timestamp}/$(DEB_TIMESTAMP)/g" \
-	    -e "s/\$${deb:Arch}/$(DEB_ARCH)/g" \
-	    $< > $@
-
-$(DEBIANDIR)/% : debian/%
-	@printf "Grabbing  %-35s > %s\n" $< $@
-	mkdir -p $(DEBIANDIR)
-	cp -f $< $@
-
-#### FORTRAN BINDINGS ####
-
-export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
-
-forlib : lib
-	$(MAKE) -C fortran lib
-fortest : forlib
-	$(MAKE) -C fortran test
-forclean :
-	$(MAKE) -C fortran clean
diff --git a/third_party/nccl/README.md b/third_party/nccl/README.md
deleted file mode 100644
index 17b9546a2a..0000000000
--- a/third_party/nccl/README.md
+++ /dev/null
@@ -1,128 +0,0 @@
-**IMPORTANT NOTE**
-
-**NCCL1 is no longer maintained/updated and has been replaced by NCCL2, available at**
-
-**http://developer.nvidia.com/nccl.**
-
-# NCCL
-
-Optimized primitives for collective multi-GPU communication.
-
-## Introduction
-
-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
-[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.
-
-## What's inside
-
-At present, the library implements the following collectives:
-- all-reduce
-- all-gather
-- reduce-scatter
-- reduce
-- broadcast
-
-These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
-
-## Requirements
-
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
-
-Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
-
-## Build & run
-
-To build the library and tests.
-
-```shell
-$ cd nccl
-$ make CUDA_HOME=<cuda install path> test
-```
-
-Test binaries are located in the subdirectories nccl/build/test/{single,mpi}.
-
-```shell
-$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./build/lib
-$ ./build/test/single/all_reduce_test
-Error: must specify at least data size in bytes!
-
-Tests nccl AllReduce with user supplied arguments.
-    Usage: all_reduce_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
-
-$ ./build/test/single/all_reduce_test 10000000
-# Using devices
-#   Device  0 ->  0 [0x0a] GeForce GTX TITAN X
-#   Device  1 ->  1 [0x09] GeForce GTX TITAN X
-#   Device  2 ->  2 [0x06] GeForce GTX TITAN X
-#   Device  3 ->  3 [0x05] GeForce GTX TITAN X
-
-#                                                 out-of-place                    in-place
-#      bytes             N    type      op     time  algbw  busbw      res     time  algbw  busbw      res
-    10000000      10000000    char     sum    1.628   6.14   9.21    0e+00    1.932   5.18   7.77    0e+00
-    10000000      10000000    char    prod    1.629   6.14   9.21    0e+00    1.643   6.09   9.13    0e+00
-    10000000      10000000    char     max    1.621   6.17   9.25    0e+00    1.634   6.12   9.18    0e+00
-    10000000      10000000    char     min    1.633   6.12   9.19    0e+00    1.637   6.11   9.17    0e+00
-    10000000       2500000     int     sum    1.611   6.21   9.31    0e+00    1.626   6.15   9.23    0e+00
-    10000000       2500000     int    prod    1.613   6.20   9.30    0e+00    1.629   6.14   9.21    0e+00
-    10000000       2500000     int     max    1.619   6.18   9.26    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000     int     min    1.619   6.18   9.27    0e+00    1.624   6.16   9.24    0e+00
-    10000000       5000000    half     sum    1.617   6.18   9.28    4e-03    1.636   6.11   9.17    4e-03
-    10000000       5000000    half    prod    1.618   6.18   9.27    1e-03    1.657   6.03   9.05    1e-03
-    10000000       5000000    half     max    1.608   6.22   9.33    0e+00    1.621   6.17   9.25    0e+00
-    10000000       5000000    half     min    1.610   6.21   9.32    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000   float     sum    1.618   6.18   9.27    5e-07    1.622   6.17   9.25    5e-07
-    10000000       2500000   float    prod    1.614   6.20   9.29    1e-07    1.628   6.14   9.21    1e-07
-    10000000       2500000   float     max    1.616   6.19   9.28    0e+00    1.633   6.12   9.19    0e+00
-    10000000       2500000   float     min    1.613   6.20   9.30    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double     sum    1.629   6.14   9.21    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double    prod    1.619   6.18   9.26    2e-16    1.628   6.14   9.21    2e-16
-    10000000       1250000  double     max    1.613   6.20   9.30    0e+00    1.630   6.13   9.20    0e+00
-    10000000       1250000  double     min    1.622   6.16   9.25    0e+00    1.623   6.16   9.24    0e+00
-```
-
-To install, run `make PREFIX=<install dir> install` and add `<instal dir>/lib` to your `LD_LIBRARY_PATH`.
-
-## Usage
-
-NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single-process machine, all GPUs can be conveniently initialized using `ncclCommInitAll`. For multi-process applications (e.g., with MPI), `ncclCommInitRank` must be called for each GPU. Internally `ncclCommInitRank` invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single-process example follows, for an MPI example see test/mpi/mpi_test.cu. For details about the API see nccl.h.
-
-```c
-#include <nccl.h>
-
-typedef struct {
-  double* sendBuff;
-  double* recvBuff;
-  int size;
-  cudaStream_t stream;
-} PerThreadData;
-
-int main(int argc, char* argv[])
-{
-  int nGPUs;
-  cudaGetDeviceCount(&nGPUs);
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nGPUs);
-  ncclCommInitAll(comms, nGPUs); // initialize communicator
-                                // One communicator per process
-
-  PerThreadData* data;
-
-  ... // Allocate data and issue work to each GPU's
-      // perDevStream to populate the sendBuffs.
-
-  for(int i=0; i<nGPUs; ++i) {
-    cudaSetDevice(i); // Correct device must be set
-                      // prior to each collective call.
-    ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
-        ncclDouble, ncclSum, comms[i], data[i].stream);
-  }
-
-  ... // Issue work into data[*].stream to consume buffers, etc.
-}
-```
-
-## Copyright and License
-
-NCCL is provided under the [BSD licence](LICENSE.txt). All source code and
-accompanying documentation is copyright (c) 2015-2016, NVIDIA CORPORATION. All
-rights reserved.
-
diff --git a/third_party/nccl/debian/.gitignore b/third_party/nccl/debian/.gitignore
deleted file mode 100644
index 1e97a9fea8..0000000000
--- a/third_party/nccl/debian/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-/*.debhelper.log
-/*.debhelper
-/*.substvars
-/tmp/
-/files
-/libnccl1/
-/libnccl-dev/
diff --git a/third_party/nccl/debian/changelog.in b/third_party/nccl/debian/changelog.in
deleted file mode 100644
index ad569a0f71..0000000000
--- a/third_party/nccl/debian/changelog.in
+++ /dev/null
@@ -1,5 +0,0 @@
-nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}-${deb:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
-
-  * Automatic Debian package from build
-
- -- cudatools <cudatools@nvidia.com>  ${deb:Timestamp}
diff --git a/third_party/nccl/debian/compat b/third_party/nccl/debian/compat
deleted file mode 100644
index ec635144f6..0000000000
--- a/third_party/nccl/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/third_party/nccl/debian/control.in b/third_party/nccl/debian/control.in
deleted file mode 100644
index e5ca48ebef..0000000000
--- a/third_party/nccl/debian/control.in
+++ /dev/null
@@ -1,28 +0,0 @@
-Source: nccl
-Section: libs
-Maintainer: cudatools <cudatools@nvidia.com>
-Priority: optional
-Build-depends: debhelper(>=9)
-Standards-Version: 3.9.5
-
-Package: libnccl${nccl:Major}
-Section: libs
-Architecture: ${deb:Arch}
-Depends: ${misc:Depends}, ${shlibs:Depends}
-Description: NVIDIA Collectives Communication Library (NCCL) Runtime
- NCCL (pronounced "Nickel") is a stand-alone library of standard collective
- communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
- that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
- to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
- applications.
-
-Package: libnccl-dev
-Section: libdevel
-Architecture: ${deb:Arch}
-Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
-Description: NVIDIA Collectives Communication Library (NCCL) Development Files
- NCCL (pronounced "Nickel") is a stand-alone library of standard collective
- communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
- that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
- to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
- applications.
diff --git a/third_party/nccl/debian/copyright b/third_party/nccl/debian/copyright
deleted file mode 120000
index 4ab43736a8..0000000000
--- a/third_party/nccl/debian/copyright
+++ /dev/null
@@ -1 +0,0 @@
-../LICENSE.txt
-\ No newline at end of file
diff --git a/third_party/nccl/debian/libnccl-dev.install b/third_party/nccl/debian/libnccl-dev.install
deleted file mode 100644
index 90299a0bee..0000000000
--- a/third_party/nccl/debian/libnccl-dev.install
+++ /dev/null
@@ -1,2 +0,0 @@
-include/nccl.h usr/include
-lib/libnccl.so /usr/lib/x86_64-linux-gnu
diff --git a/third_party/nccl/debian/libnccl-dev.manpages b/third_party/nccl/debian/libnccl-dev.manpages
deleted file mode 100644
index 4bfc2cb226..0000000000
--- a/third_party/nccl/debian/libnccl-dev.manpages
+++ /dev/null
@@ -1 +0,0 @@
-debian/nccl.7
diff --git a/third_party/nccl/debian/libnccl1.install.in b/third_party/nccl/debian/libnccl1.install.in
deleted file mode 100644
index 73b4c0a9dd..0000000000
--- a/third_party/nccl/debian/libnccl1.install.in
+++ /dev/null
@@ -1,2 +0,0 @@
-lib/libnccl.so.${nccl:Major} /usr/lib/x86_64-linux-gnu
-lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/x86_64-linux-gnu
diff --git a/third_party/nccl/debian/nccl.7 b/third_party/nccl/debian/nccl.7
deleted file mode 100644
index 0cb5601733..0000000000
--- a/third_party/nccl/debian/nccl.7
+++ /dev/null
@@ -1,139 +0,0 @@
-.TH NCCL
-.SH NAME
-.PP
-nccl \- Optimized primitives for collective multi\-GPU communication.
-
-.SH Introduction
-.PP
-NCCL (pronounced "Nickel") is a stand\-alone library of standard collective communication routines, such as all\-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single\- or multi\-process (e.g., MPI) applications.
-
-.SH What's inside
-.PP
-At present, the library implements the following collectives:
-\- all\-reduce
-\- all\-gather
-\- reduce\-scatter
-\- reduce
-\- broadcast
-
-.PP
-These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
-
-.SH Requirements
-.PP
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi\-socket configurations are also supported.
-
-.PP
-Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
-
-.SH Build & run
-.PP
-To build the library and tests.
-
-.PP
-.RS
-
-.nf
-$ cd nccl
-$ make CUDA\_HOME=<cuda install path> test
-
-.fi
-.RE
-
-.PP
-Test binaries are located in the subdirectories nccl/build/test and nccl/build/mpitest.
-
-.PP
-.RS
-
-.nf
-$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:./build/lib
-$ ./build/test/all\_reduce\_test
-Error: must specify at least data size in bytes!
-
-Tests nccl AllReduce with user supplied arguments.
-    Usage: all\_reduce\_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
-
-$ ./build/test/all\_reduce\_test 10000000
-# Using devices
-#   Device  0 \->  0 [0x0a] GeForce GTX TITAN X
-#   Device  1 \->  1 [0x09] GeForce GTX TITAN X
-#   Device  2 \->  2 [0x06] GeForce GTX TITAN X
-#   Device  3 \->  3 [0x05] GeForce GTX TITAN X
-
-#                                                 out\-of\-place                    in\-place
-#      bytes             N    type      op     time  algbw  busbw      res     time  algbw  busbw      res
-    10000000      10000000    char     sum    1.628   6.14   9.21    0e+00    1.932   5.18   7.77    0e+00
-    10000000      10000000    char    prod    1.629   6.14   9.21    0e+00    1.643   6.09   9.13    0e+00
-    10000000      10000000    char     max    1.621   6.17   9.25    0e+00    1.634   6.12   9.18    0e+00
-    10000000      10000000    char     min    1.633   6.12   9.19    0e+00    1.637   6.11   9.17    0e+00
-    10000000       2500000     int     sum    1.611   6.21   9.31    0e+00    1.626   6.15   9.23    0e+00
-    10000000       2500000     int    prod    1.613   6.20   9.30    0e+00    1.629   6.14   9.21    0e+00
-    10000000       2500000     int     max    1.619   6.18   9.26    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000     int     min    1.619   6.18   9.27    0e+00    1.624   6.16   9.24    0e+00
-    10000000       5000000    half     sum    1.617   6.18   9.28    4e\-03    1.636   6.11   9.17    4e\-03
-    10000000       5000000    half    prod    1.618   6.18   9.27    1e\-03    1.657   6.03   9.05    1e\-03
-    10000000       5000000    half     max    1.608   6.22   9.33    0e+00    1.621   6.17   9.25    0e+00
-    10000000       5000000    half     min    1.610   6.21   9.32    0e+00    1.627   6.15   9.22    0e+00
-    10000000       2500000   float     sum    1.618   6.18   9.27    5e\-07    1.622   6.17   9.25    5e\-07
-    10000000       2500000   float    prod    1.614   6.20   9.29    1e\-07    1.628   6.14   9.21    1e\-07
-    10000000       2500000   float     max    1.616   6.19   9.28    0e+00    1.633   6.12   9.19    0e+00
-    10000000       2500000   float     min    1.613   6.20   9.30    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double     sum    1.629   6.14   9.21    0e+00    1.628   6.14   9.21    0e+00
-    10000000       1250000  double    prod    1.619   6.18   9.26    2e\-16    1.628   6.14   9.21    2e\-16
-    10000000       1250000  double     max    1.613   6.20   9.30    0e+00    1.630   6.13   9.20    0e+00
-    10000000       1250000  double     min    1.622   6.16   9.25    0e+00    1.623   6.16   9.24    0e+00
-
-.fi
-.RE
-
-.PP
-To install, run \fB\fCmake PREFIX=<install dir> install\fR and add \fB\fC<instal dir>/lib\fR to your \fB\fCLD\_LIBRARY\_PATH\fR.
-
-.SH Usage
-.PP
-NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single\-process machine, all GPUs can be conveniently initialized using \fB\fCncclCommInitAll\fR. For multi\-process applications (e.g., with MPI), \fB\fCncclCommInitRank\fR must be called for each GPU. Internally \fB\fCncclCommInitRank\fR invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single\-process example follows, for an MPI example see src/mpi\_test.cu. For details about the API see nccl.h.
-
-.PP
-.RS
-
-.nf
-#include <nccl.h>
-
-typedef struct \{
-  double* sendBuff;
-  double* recvBuff;
-  int size;
-  cudaStream\_t stream;
-\} PerThreadData;
-
-int main(int argc, char* argv[])
-\{
-  int nGPUs;
-  cudaGetDeviceCount(\&nGPUs);
-  ncclComm\_t* comms = (ncclComm\_t*)malloc(sizeof(ncclComm\_t)*nGPUs);
-  ncclCommInitAll(comms, nGPUs); // initialize communicator
-                                // One communicator per process
-
-  PerThreadData* data;
-
-  ... // Allocate data and issue work to each GPU's
-      // perDevStream to populate the sendBuffs.
-
-  for(int i=0; i<nGPUs; ++i) \{
-    cudaSetDevice(i); // Correct device must be set
-                      // prior to each collective call.
-    ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
-        ncclDouble, ncclSum, comms[i], data[i].stream);
-  \}
-
-  ... // Issue work into data[*].stream to consume buffers, etc.
-\}
-
-.fi
-.RE
-
-.SH Copyright
-.PP
-All source code and accompanying documentation is copyright (c) 2015\-2016, NVIDIA CORPORATION. All
-rights reserved.
diff --git a/third_party/nccl/debian/rules b/third_party/nccl/debian/rules
deleted file mode 100755
index 23b90a9e01..0000000000
--- a/third_party/nccl/debian/rules
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/make -f
-
-%:
-	dh $@ --parallel
-
-override_dh_auto_install:
-	PREFIX=debian/tmp dh_auto_install
-
-override_dh_auto_test:
-	# Do not make test
-
-override_dh_auto_clean:
-	# Do not make clean
diff --git a/third_party/nccl/debian/shlibs.local.in b/third_party/nccl/debian/shlibs.local.in
deleted file mode 100644
index 82505da490..0000000000
--- a/third_party/nccl/debian/shlibs.local.in
+++ /dev/null
@@ -1 +0,0 @@
-libcudart ${cuda:Major}.${cuda:Minor} cuda-cudart-${cuda:Major}-${cuda:Minor}
diff --git a/third_party/nccl/debian/source/format b/third_party/nccl/debian/source/format
deleted file mode 100644
index 89ae9db8f8..0000000000
--- a/third_party/nccl/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (native)
diff --git a/third_party/nccl/fortran/Makefile b/third_party/nccl/fortran/Makefile
deleted file mode 100644
index b60b0165b7..0000000000
--- a/third_party/nccl/fortran/Makefile
+++ /dev/null
@@ -1,81 +0,0 @@
-FC := gfortran
-FCNAME := $(notdir $(FC))
-
-BUILDDIR ?= ../build
-INCDIR := $(BUILDDIR)/include
-LIBDIR := $(BUILDDIR)/lib
-OBJDIR := $(BUILDDIR)/obj
-
-LIBNAME    := libncclfor.so
-LIBSONAME  := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
-LIBTARGET  := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
-LIBLINK    += $(patsubst lib%.so,-l%,$(LIBNAME))
-
-LIBCUDAFOR := libcudafor.so
-
-ifneq ($(filter pgf%, $(FCNAME)), )
-# PGI compiler (pgfortran, pgf90, pgf95)
-FCMODFLAGS  := -module $(INCDIR)
-FCPREFLAGS  := -Mpreprocess
-FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
-FCFLAGS     := -fast -O3
-else
-# non-PGI compilers do not have CUDA support, compile our own CUDA lib
-CUDAFORDEP  := $(LIBDIR)/$(LIBCUDAFOR)
-CUDALINK    := -L$(CUDA_LIB) -lcudart
-CUDAFORLINK := -lcudafor
-ifeq ($(FCNAME), gfortran)
-FCMODFLAGS  := -J$(INCDIR)
-FCPREFLAGS  += -cpp
-FCFLAGS     += -ffree-line-length-none
-else ifeq ($(FCNAME), ifort)
-FCMODFLAGS  := -module $(INCDIR)
-FCPREFLAGS  += -fpp
-endif
-endif
-
-ifeq ($(VERBOSE), 0)
-.SILENT:
-endif
-
-lib: $(CUDAFORDEP)
-	$(MAKE) $(LIBDIR)/$(LIBTARGET)
-
-$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
-	@printf "Linking   %-35s > %s\n" $(LIBTARGET) $@
-	mkdir -p $(LIBDIR)
-	$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
-	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
-	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
-
-$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
-	@printf "Linking   %-35s > %s\n" $(LIBCUDAFOR) $@
-	mkdir -p $(LIBDIR)
-	$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
-
-$(OBJDIR)/%.o: src/%.f90
-	@printf "Building  %-35s > %s\n" $< $@
-	mkdir -p $(OBJDIR)
-	mkdir -p $(INCDIR)
-	$(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
-
-TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
-ifneq ($(filter pgf%, $(FCNAME)), )
-TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
-endif
-
-TESTDIR  := $(BUILDDIR)/test/fortran
-TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
-
-test: lib $(TESTBINS)
-
-$(TESTDIR)/%: test/%.f90 lib
-	@printf "Building  %-35s > %s\n" $< $@
-	@mkdir -p $(TESTDIR)
-	$(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
-
-clean:
-	rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
-	rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
-	rm -rf $(TESTDIR)/
-
diff --git a/third_party/nccl/fortran/src/cudafor.f90 b/third_party/nccl/fortran/src/cudafor.f90
deleted file mode 100644
index 4ecd0f41b8..0000000000
--- a/third_party/nccl/fortran/src/cudafor.f90
+++ /dev/null
@@ -1,171 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-#ifndef _CUDA
-
-!Start cudaFor module
-module cudaFor
-use iso_c_binding
-implicit none
-private
-public :: c_devptr
-public :: cudaMemcpyKind,           &
-          cudaMemcpyHostToHost,     &
-          cudaMemcpyHostToDevice,   &
-          cudaMemcpyDeviceToHost,   &
-          cudaMemcpyDeviceToDevice, &
-          cudaMemcpyDefault
-public :: cuda_stream_kind
-public :: cudaGetDeviceCount
-public :: cudaSetDevice
-public :: cudaMalloc
-public :: cudaMemcpy
-public :: cudaFree
-public :: cudaStreamCreate
-public :: cudaStreamSynchronize
-public :: cudaStreamDestroy
-
-!Start types
-
-!Start c_devptr
-type, bind(c) :: c_devptr
-type(c_ptr) :: member
-end type c_devptr
-!End c_devptr
-
-!Start cudaMemcpyKind
-type, bind(c) :: cudaMemcpyKind
-integer(c_int) :: member
-end type cudaMemcpyKind
-
-type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost     = cudaMemcpyKind(0), &
-                                   cudaMemcpyHostToDevice   = cudaMemcpyKind(1), &
-                                   cudaMemcpyDeviceToHost   = cudaMemcpyKind(2), &
-                                   cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
-                                   cudaMemcpyDefault        = cudaMemcpyKind(4)
-!End cudaMemcpyKind
-
-!Start cuda_stream_kind
-integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
-!End cuda_stream_kind
-
-!End types
-
-!Start interfaces
-
-!Start cudaGetDeviceCount
-interface cudaGetDeviceCount
-integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
-import :: c_int
-implicit none
-integer(c_int) :: count
-end function cudaGetDeviceCount
-end interface cudaGetDeviceCount
-!End cudaGetDeviceCount
-
-!Start cudaSetDevice
-interface cudaSetDevice
-integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
-import :: c_int
-implicit none
-integer(c_int), value :: device
-end function cudaSetDevice
-end interface cudaSetDevice
-!End cudaSetDevice
-
-!Start cudaMalloc
-interface cudaMalloc
-integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
-import :: c_int, c_size_t
-import :: c_devptr
-implicit none
-type(c_devptr) :: devPtr
-integer(c_size_t), value :: size
-end function cudaMalloc
-end interface cudaMalloc
-!End cudaMalloc
-
-!Start cudaMemcpy
-interface cudaMemcpy
-
-!Start cudaMemcpyH2D
-integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
-import :: c_ptr, c_int, c_size_t
-import :: c_devptr, cudaMemcpyKind
-implicit none
-type(c_devptr), value :: dst
-type(c_ptr), value :: src
-integer(c_size_t), value :: count
-type(cudaMemcpyKind), value :: kind
-end function cudaMemcpyH2D
-!End cudaMemcpyH2D
-
-!Start cudaMemcpyD2H
-integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
-import :: c_ptr, c_int, c_size_t
-import :: c_devptr, cudaMemcpyKind
-implicit none
-type(c_ptr), value :: dst
-type(c_devptr), value :: src
-integer(c_size_t), value :: count
-type(cudaMemcpyKind), value :: kind
-end function cudaMemcpyD2H
-!End cudaMemcpyD2H
-
-end interface cudaMemcpy
-!End cudaMemcpy
-
-!Start cudaFree
-interface cudaFree
-integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
-import :: c_int
-import :: c_devptr
-implicit none
-type(c_devptr), value :: devPtr
-end function cudaFree
-end interface cudaFree
-!End cudaFree
-
-!Start cudaStreamCreate
-interface cudaStreamCreate
-integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
-import :: c_int
-import :: cuda_stream_kind
-implicit none
-integer(cuda_stream_kind) :: pStream
-end function cudaStreamCreate
-end interface cudaStreamCreate
-!End cudaStreamCreate
-
-!Start cudaStreamSynchronize
-interface cudaStreamSynchronize
-integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
-import :: c_int
-import :: cuda_stream_kind
-implicit none
-integer(cuda_stream_kind), value :: stream
-end function cudaStreamSynchronize
-end interface cudaStreamSynchronize
-!End cudaStreamSynchronize
-
-!Start cudaStreamDestroy
-interface cudaStreamDestroy
-integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
-import :: c_int
-import :: cuda_stream_kind
-implicit none
-integer(cuda_stream_kind), value :: stream
-end function cudaStreamDestroy
-end interface cudaStreamDestroy
-!End cudaStreamDestroy
-
-!End interfaces
-
-end module cudaFor
-!End cudaFor module
-
-#endif
diff --git a/third_party/nccl/fortran/src/ncclfor.f90 b/third_party/nccl/fortran/src/ncclfor.f90
deleted file mode 100644
index 2ed4d3d874..0000000000
--- a/third_party/nccl/fortran/src/ncclfor.f90
+++ /dev/null
@@ -1,312 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-!Start defines
-#define NCCL_UNIQUE_ID_BYTES 128
-!End defines
-
-!Start ncclFor module
-module ncclFor
-use iso_c_binding
-use cudaFor
-implicit none
-private
-public :: ncclUniqueId
-public :: ncclComm
-public :: ncclResult,                 &
-          ncclSuccess,                &
-          ncclUnhandledCudaError,     &
-          ncclSystemError,            &
-          ncclInternalError,          &
-          ncclInvalidDevicePointer,   &
-          ncclInvalidRank,            &
-          ncclUnsupportedDeviceCount, &
-          ncclDeviceNotFound,         &
-          ncclInvalidDeviceIndex,     &
-          ncclLibWrapperNotSet,       &
-          ncclCudaMallocFailed,       &
-          ncclRankMismatch,           &
-          ncclInvalidArgument,        &
-          ncclInvalidType,            &
-          ncclInvalidOperation,       &
-          nccl_NUM_RESULTS
-public :: ncclDataType, &
-          ncclChar,     &
-          ncclInt,      &
-#ifdef CUDA_HAS_HALF
-          ncclHalf,     &
-#endif
-          ncclFloat,    &
-          ncclDouble,   &
-          ncclInt64,    &
-          ncclUInt64,   &
-          nccl_NUM_TYPES
-public :: ncclRedOp, &
-          ncclSum,   &
-          ncclProd,  &
-          ncclMax,   &
-          ncclMin,   &
-          nccl_NUM_OPS
-public :: ncclGetUniqueId
-public :: ncclCommInitRank
-public :: ncclCommInitAll
-public :: ncclCommCuDevice
-public :: ncclCommUserRank
-public :: ncclCommCount
-public :: ncclCommDestroy
-public :: ncclReduce
-public :: ncclAllReduce
-public :: ncclReduceScatter
-public :: ncclBcast
-public :: ncclAllGather
-
-!Start types
-
-!Start ncclUniqueId
-type, bind(c) :: ncclUniqueId
-character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
-end type ncclUniqueId
-!End ncclUniqueId
-
-!Start ncclComm
-type, bind(c) :: ncclComm
-type(c_ptr) :: member
-end type ncclComm
-!End ncclComm
-
-!Start ncclResult
-type, bind(c) :: ncclResult
-integer(c_int) :: member
-end type ncclResult
-
-type(ncclResult), parameter :: ncclSuccess                = ncclResult( 0), &
-                               ncclUnhandledCudaError     = ncclResult( 1), &
-                               ncclSystemError            = ncclResult( 2), &
-                               ncclInternalError          = ncclResult( 3), &
-                               ncclInvalidDevicePointer   = ncclResult( 4), &
-                               ncclInvalidRank            = ncclResult( 5), &
-                               ncclUnsupportedDeviceCount = ncclResult( 6), &
-                               ncclDeviceNotFound         = ncclResult( 7), &
-                               ncclInvalidDeviceIndex     = ncclResult( 8), &
-                               ncclLibWrapperNotSet       = ncclResult( 9), &
-                               ncclCudaMallocFailed       = ncclResult(10), &
-                               ncclRankMismatch           = ncclResult(11), &
-                               ncclInvalidArgument        = ncclResult(12), &
-                               ncclInvalidType            = ncclResult(13), &
-                               ncclInvalidOperation       = ncclResult(14), &
-                               nccl_NUM_RESULTS           = ncclResult(15)
-!End ncclResult
-
-!Start ncclDataType
-type, bind(c) :: ncclDataType
-integer(c_int) :: member
-end type ncclDataType
-
-type(ncclDataType), parameter :: ncclChar       = ncclDataType(0), &
-                                 ncclInt        = ncclDataType(1), &
-#ifdef CUDA_HAS_HALF
-                                 ncclHalf       = ncclDataType(2), &
-#endif
-                                 ncclFloat      = ncclDataType(3), &
-                                 ncclDouble     = ncclDataType(4), &
-                                 ncclInt64      = ncclDataType(5), &
-                                 ncclUInt64     = ncclDataType(6), &
-                                 nccl_NUM_TYPES = ncclDataType(7)
-!End ncclDataType
-
-!Start ncclRedOp
-type, bind(c) :: ncclRedOp
-integer(c_int) :: member
-end type ncclRedOp
-
-type(ncclRedOp), parameter :: ncclSum      = ncclRedOp(0), &
-                              ncclProd     = ncclRedOp(1), &
-                              ncclMax      = ncclRedOp(2), &
-                              ncclMin      = ncclRedOp(3), &
-                              nccl_NUM_OPS = ncclRedOp(4)
-!End ncclRedOp
-
-!End types
-
-!Start interfaces
-
-!Start ncclGetUniqueId
-interface ncclGetUniqueId
-type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
-import :: ncclResult, ncclUniqueId
-implicit none
-type(ncclUniqueId) :: uniqueId
-end function ncclGetUniqueId
-end interface ncclGetUniqueId
-!End ncclGetUniqueId
-
-!Start ncclCommInitRank
-interface ncclCommInitRank
-type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
-import :: c_int
-import :: ncclResult, ncclUniqueId, ncclComm
-implicit none
-type(ncclComm) :: comm(*)
-integer(c_int), value :: ndev
-type(ncclUniqueId), value :: commId
-integer(c_int), value :: rank
-end function ncclCommInitRank
-end interface ncclCommInitRank
-!End ncclCommInitRank
-
-!Start ncclCommInitAll
-interface ncclCommInitAll
-type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm) :: comm(*)
-integer(c_int), value :: ndev
-integer(c_int) :: devlist(*)
-end function ncclCommInitAll
-end interface ncclCommInitAll
-!End ncclCommInitAll
-
-!Start ncclCommCuDevice
-interface ncclCommCuDevice
-type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm), value :: comm
-integer(c_int) :: devid
-end function ncclCommCuDevice
-end interface ncclCommCuDevice
-!End ncclCommCuDevice
-
-!Start ncclCommUserRank
-interface ncclCommUserRank
-type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm), value :: comm
-integer(c_int) :: rank
-end function ncclCommUserRank
-end interface ncclCommUserRank
-!End ncclCommUserRank
-
-!Start ncclCommCount
-interface ncclCommCount
-type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm), value :: comm
-integer(c_int) :: count
-end function ncclCommCount
-end interface ncclCommCount
-!End ncclCommCount
-
-!Start ncclCommDestroy
-interface ncclCommDestroy
-subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
-import :: ncclComm
-implicit none
-type(ncclComm), value :: comm
-end subroutine ncclCommDestroy
-end interface ncclCommDestroy
-!End ncclCommDestroy
-
-!Start ncclReduce
-interface ncclReduce
-type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
-implicit none
-type(c_devptr), value :: sendbuff
-type(c_devptr), value :: recvbuff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-type(ncclRedOp), value :: op
-integer(c_int), value :: root
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclReduce
-end interface ncclReduce
-!End ncclReduce
-
-!Start ncclAllReduce
-interface ncclAllReduce
-type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
-implicit none
-type(c_devptr), value :: sendbuff
-type(c_devptr), value :: recvbuff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-type(ncclRedOp), value :: op
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclAllReduce
-end interface ncclAllReduce
-!End ncclAllReduce
-
-!Start ncclReduceScatter
-interface ncclReduceScatter
-type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
-implicit none
-type(c_devptr), value :: sendbuff
-type(c_devptr), value :: recvbuff
-integer(c_int), value :: recvcount
-type(ncclDataType), value :: datatype
-type(ncclRedOp), value :: op
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclReduceScatter
-end interface ncclReduceScatter
-!End ncclReduceScatter
-
-!Start ncclBcast
-interface ncclBcast
-type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType
-implicit none
-type(c_devptr), value :: buff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-integer(c_int), value :: root
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclBcast
-end interface ncclBcast
-!End ncclBcast
-
-!Start ncclAllGather
-interface ncclAllGather
-type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType
-implicit none
-type(c_devptr), value :: sendbuff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-type(c_devptr), value :: recvbuff
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclAllGather
-end interface ncclAllGather
-!End ncclAllGather
-
-!End interfaces
-
-end module ncclFor
-!End nccl module
diff --git a/third_party/nccl/fortran/test/allgather_arr_out.f90 b/third_party/nccl/fortran/test/allgather_arr_out.f90
deleted file mode 100644
index 17fbf7a7bc..0000000000
--- a/third_party/nccl/fortran/test/allgather_arr_out.f90
+++ /dev/null
@@ -1,162 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-  stat = cudaGetDeviceCount(nDev)
-
-  dataType = ncclFloat
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl * nDev, nDev + 1))
-
-  call random_number(hostBuff)
-
-  print "(a)", "before allgather:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
-  end do
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(sendBuff(nEl))
-    sendBuffPtr(i) = c_devloc(sendBuff)
-    sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(recvBuff(nEl * nDev))
-    recvBuffPtr(i) = c_devloc(recvBuff)
-    recvBuff = hostBuff(:, i)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
-    hostBuff(:, i) = recvBuff
-  end do
-
-  print "(a)", ""
-  print "(a)", "after allgather:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
-    hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
-  end do
-
-  err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
-  print "(a)", ""
-  print "(a, e11.4e2)", "maximum error in sendbuff = ", err
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
-    deallocate(recvBuff)
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
-    deallocate(sendBuff)
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/allgather_ptr_out.f90 b/third_party/nccl/fortran/test/allgather_ptr_out.f90
deleted file mode 100644
index f7d196284a..0000000000
--- a/third_party/nccl/fortran/test/allgather_ptr_out.f90
+++ /dev/null
@@ -1,171 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-  stat = cudaGetDeviceCount(nDev)
-
-  dataType = ncclFloat
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl * nDev, nDev + 1))
-
-  call random_number(hostBuff)
-
-  print "(a)", "before allgather:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
-  end do
-
-  allocate(hostBuffPtr(nDev))
-
-  do i = 1, nDev
-    hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
-  end do
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
-    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
-  end do
-
-  do i = 1, nDev
-    hostBuffPtr(i) = c_loc(hostBuff(1, i))
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
-    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
-  end do
-
-  print "(a)", ""
-  print "(a)", "after allgather:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
-  end do
-
-  do i = 1, nDev
-    hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-  end do
-
-  err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
-  print "(a)", ""
-  print "(a, e11.4e2)", "maximum error in sendbuff = ", err
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(recvBuffPtr(i))
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(sendBuffPtr(i))
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/allreduce_arr_out.f90 b/third_party/nccl/fortran/test/allreduce_arr_out.f90
deleted file mode 100644
index 50c1b6488a..0000000000
--- a/third_party/nccl/fortran/test/allreduce_arr_out.f90
+++ /dev/null
@@ -1,165 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-  stat = cudaGetDeviceCount(nDev)
-
-  dataType = ncclFloat
-  redOp = ncclProd
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl, nDev + 2))
-
-  call random_number(hostBuff(:, 1:nDev + 1))
-
-  hostBuff(:, nDev + 2) = hostBuff(:, 1)
-  do i = 2, nDev
-    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
-  end do
-
-  print "(a)", "before allreduce:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(sendBuff(nEl))
-    sendBuffPtr(i) = c_devloc(sendBuff)
-    sendBuff = hostBuff(:, i)
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(recvBuff(nEl))
-    recvBuffPtr(i) = c_devloc(recvBuff)
-    recvBuff = hostBuff(:, i)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  print "(a)", ""
-  print "(a)", "after allreduce:"
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
-    hostBuff(:, nDev + 1) = recvBuff
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  print "(a)", ""
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
-    hostBuff(:, nDev + 1) = sendBuff
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
-    deallocate(recvBuff)
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
-    deallocate(sendBuff)
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/allreduce_ptr_out.f90 b/third_party/nccl/fortran/test/allreduce_ptr_out.f90
deleted file mode 100644
index 2c1248f312..0000000000
--- a/third_party/nccl/fortran/test/allreduce_ptr_out.f90
+++ /dev/null
@@ -1,166 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-  stat = cudaGetDeviceCount(nDev)
-
-  dataType = ncclFloat
-  redOp = ncclProd
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl, nDev + 2))
-
-  call random_number(hostBuff(:, 1:nDev + 1))
-
-  hostBuff(:, nDev + 2) = hostBuff(:, 1)
-  do i = 2, nDev
-    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
-  end do
-
-  print "(a)", "before allreduce:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  allocate(hostBuffPtr(nDev + 1))
-
-  do i = 1, nDev + 1
-    hostBuffPtr(i) = c_loc(hostBuff(1, i))
-  end do
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
-    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
-    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  print "(a)", ""
-  print "(a)", "after allreduce:"
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  print "(a)", ""
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(recvBuffPtr(i))
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(sendBuffPtr(i))
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/broadcast_arr.f90 b/third_party/nccl/fortran/test/broadcast_arr.f90
deleted file mode 100644
index 867fa1aadb..0000000000
--- a/third_party/nccl/fortran/test/broadcast_arr.f90
+++ /dev/null
@@ -1,137 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: devBuff(:)
-type(c_devptr), allocatable :: devBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-!  root = 0
-  stat = cudaGetDeviceCount(nDev)
-  root = nDev - 1
-
-  dataType = ncclFloat
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl, nDev + 1))
-
-  call random_number(hostBuff(:, 1:nDev))
-
-  hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
-
-  print "(a)", "before broadcast:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
-  end do
-
-  allocate(devBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(devBuff(nEl))
-    devBuffPtr(i) = c_devloc(devBuff)
-    devBuff = hostBuff(:, i)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
-    hostBuff(:, i) = devBuff
-  end do
-
-  print "(a)", ""
-  print "(a)", "after broadcast:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
-    deallocate(devBuff)
-  end do
-
-  deallocate(devBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/broadcast_ptr.f90 b/third_party/nccl/fortran/test/broadcast_ptr.f90
deleted file mode 100644
index 963afeed40..0000000000
--- a/third_party/nccl/fortran/test/broadcast_ptr.f90
+++ /dev/null
@@ -1,142 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: devBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-!  root = 0
-  stat = cudaGetDeviceCount(nDev)
-  root = nDev - 1
-
-  dataType = ncclFloat
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl, nDev + 1))
-
-  call random_number(hostBuff(:, 1:nDev))
-
-  hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
-
-  print "(a)", "before broadcast:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
-  end do
-
-  allocate(hostBuffPtr(nDev))
-
-  do i = 1, nDev
-    hostBuffPtr(i) = c_loc(hostBuff(1, i))
-  end do
-
-  allocate(devBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
-    stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-  end do
-
-  print "(a)", ""
-  print "(a)", "after broadcast:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
-    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(devBuffPtr(i))
-  end do
-
-  deallocate(devBuffPtr)
-
-  deallocate(hostBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/reduce_arr_out.f90 b/third_party/nccl/fortran/test/reduce_arr_out.f90
deleted file mode 100644
index 17e41b45fe..0000000000
--- a/third_party/nccl/fortran/test/reduce_arr_out.f90
+++ /dev/null
@@ -1,164 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-!  root = 0
-  stat = cudaGetDeviceCount(nDev)
-  root = nDev - 1
-
-  dataType = ncclFloat
-  redOp = ncclProd
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl, nDev + 2))
-
-  call random_number(hostBuff(:, 1:nDev + 1))
-
-  hostBuff(:, nDev + 2) = hostBuff(:, 1)
-  do i = 2, nDev
-    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
-  end do
-
-  print "(a)", "before reduce:"
-  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
-  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(sendBuff(nEl))
-    sendBuffPtr(i) = c_devloc(sendBuff)
-    sendBuff = hostBuff(:, i)
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(recvBuff(nEl))
-    recvBuffPtr(i) = c_devloc(recvBuff)
-    recvBuff = hostBuff(:, i)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  stat = cudaSetDevice(devList(root + 1))
-  call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
-  hostBuff(:, nDev + 1) = recvBuff
-
-  print "(a)", ""
-  print "(a)", "after reduce:"
-  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
-  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
-  print "(a)", ""
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
-    hostBuff(:, nDev + 1) = sendBuff
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
-    deallocate(recvBuff)
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
-    deallocate(sendBuff)
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/reduce_ptr_out.f90 b/third_party/nccl/fortran/test/reduce_ptr_out.f90
deleted file mode 100644
index 777f8ea076..0000000000
--- a/third_party/nccl/fortran/test/reduce_ptr_out.f90
+++ /dev/null
@@ -1,165 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-!  root = 0
-  stat = cudaGetDeviceCount(nDev)
-  root = nDev - 1
-
-  dataType = ncclFloat
-  redOp = ncclProd
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl, nDev + 2))
-
-  call random_number(hostBuff(:, 1:nDev + 1))
-
-  hostBuff(:, nDev + 2) = hostBuff(:, 1)
-  do i = 2, nDev
-    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
-  end do
-
-  print "(a)", "before reduce:"
-  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
-  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
-  allocate(hostBuffPtr(nDev + 1))
-
-  do i = 1, nDev + 1
-    hostBuffPtr(i) = c_loc(hostBuff(1, i))
-  end do
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
-    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
-    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  stat = cudaSetDevice(devList(root + 1))
-  stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-
-  print "(a)", ""
-  print "(a)", "after reduce:"
-  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
-  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
-  print "(a)", ""
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(recvBuffPtr(i))
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(sendBuffPtr(i))
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/reducescatter_arr_out.f90 b/third_party/nccl/fortran/test/reducescatter_arr_out.f90
deleted file mode 100644
index 6a976dac1f..0000000000
--- a/third_party/nccl/fortran/test/reducescatter_arr_out.f90
+++ /dev/null
@@ -1,165 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-  stat = cudaGetDeviceCount(nDev)
-
-  dataType = ncclFloat
-  redOp = ncclProd
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl * nDev, nDev + 2))
-
-  call random_number(hostBuff(:, 1:nDev + 1))
-
-  hostBuff(:, nDev + 2) = hostBuff(:, 1)
-  do i = 2, nDev
-    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
-  end do
-
-  print "(a)", "before reducescatter:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(sendBuff(nEl * nDev))
-    sendBuffPtr(i) = c_devloc(sendBuff)
-    sendBuff = hostBuff(:, i)
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    allocate(recvBuff(nEl))
-    recvBuffPtr(i) = c_devloc(recvBuff)
-    recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  print "(a)", ""
-  print "(a)", "after reducescatter:"
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
-    hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
-    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  print "(a)", ""
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
-    hostBuff(:, nDev + 1) = sendBuff
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
-    deallocate(recvBuff)
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
-    deallocate(sendBuff)
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/fortran/test/reducescatter_ptr_out.f90 b/third_party/nccl/fortran/test/reducescatter_ptr_out.f90
deleted file mode 100644
index 9df35bffe8..0000000000
--- a/third_party/nccl/fortran/test/reducescatter_ptr_out.f90
+++ /dev/null
@@ -1,174 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
-  nEl = 2621440
-
-!  nDev = 2
-  stat = cudaGetDeviceCount(nDev)
-
-  dataType = ncclFloat
-  redOp = ncclProd
-
-  allocate(comm(nDev))
-  allocate(devList(nDev))
-
-  do i = 1, nDev
-    devList(i) = i - 1
-  end do
-
-  res = ncclCommInitAll(comm, nDev, devList)
-
-  do i = 1, nDev
-    res = ncclCommCuDevice(comm(i), cudaDev)
-    res = ncclCommUserRank(comm(i), rank)
-  end do
-
-  allocate(stream(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamCreate(stream(i))
-  end do
-
-  call date_and_time(values = time)
-  call random_seed(size = i)
-  allocate(seed(i))
-  call random_seed(get = seed)
-  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
-  call random_seed(put = seed)
-
-  allocate(hostBuff(nEl * nDev, nDev + 2))
-
-  call random_number(hostBuff(:, 1:nDev + 1))
-
-  hostBuff(:, nDev + 2) = hostBuff(:, 1)
-  do i = 2, nDev
-    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
-  end do
-
-  print "(a)", "before reducescatter:"
-  do i = 1, nDev
-    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  allocate(hostBuffPtr(nDev + 1))
-
-  do i = 1, nDev + 1
-    hostBuffPtr(i) = c_loc(hostBuff(1, i))
-  end do
-
-  allocate(sendBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
-    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
-  end do
-
-  do i = 1, nDev
-    hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
-  end do
-
-  allocate(recvBuffPtr(nDev))
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
-    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
-  end do
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamSynchronize(stream(i))
-  end do
-
-  print "(a)", ""
-  print "(a)", "after reduceScatter:"
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
-  end do
-
-  do i = 1, nDev + 1
-    hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
-  end do
-
-  print "(a)", ""
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
-    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
-    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
-  end do
-  print "(a)", ""
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(recvBuffPtr(i))
-  end do
-
-  deallocate(recvBuffPtr)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaFree(sendBuffPtr(i))
-  end do
-
-  deallocate(sendBuffPtr)
-
-  deallocate(hostBuffPtr)
-
-  deallocate(hostBuff)
-
-  deallocate(seed)
-
-  do i = 1, nDev
-    stat = cudaSetDevice(devList(i))
-    stat = cudaStreamDestroy(stream(i))
-  end do
-
-  deallocate(stream)
-
-  do i = 1, nDev
-    call ncclCommDestroy(comm(i))
-  end do
-
-  deallocate(devList)
-  deallocate(comm)
-
-end program test
diff --git a/third_party/nccl/nccl b/third_party/nccl/nccl
new file mode 120000
index 0000000000..f8da0e204e
--- /dev/null
+++ b/third_party/nccl/nccl
@@ -0,0 +1 @@
+Subproject commit f93fe9bfd94884cec2ba711897222e0df5569a53
diff --git a/third_party/nccl/src/all_gather.cu b/third_party/nccl/src/all_gather.cu
deleted file mode 100644
index cb36b71796..0000000000
--- a/third_party/nccl/src/all_gather.cu
+++ /dev/null
@@ -1,202 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void AllGatherKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ T* sharedNextOutput;
-  __shared__ DevRing<T> ring;
-  bool pushrecv = args.pushrecv;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-    if (pushrecv) {
-      *ring.sendPtrToPrev = (T*)args.ThisOutput;
-      Wait([=] {
-        return *ring.recvPtrFromNext != nullptr;
-      });
-      sharedNextOutput = *ring.recvPtrFromNext;
-      *ring.recvPtrFromNext = nullptr;
-    }
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
-
-  const int size = args.N;
-  const int nranks = args.nRanks;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int poffset, noffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) {
-    /////////////// begin AllGather steps ///////////////
-    int offset;
-    int maxOffset = size-chunkOffset;
-    int rankDest;
-
-    // step 0: push data to next GPU
-    rankDest = ring.userRank[0];
-    offset = chunkOffset + rankDest * size;
-
-    if (thisInput == thisOutput) {
-      Prims::Copy(
-          thisInput  + offset,
-          pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-    } else {
-      Prims::DoubleCopy(
-          thisInput  + chunkOffset,
-          thisOutput + offset,
-          pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-    }
-
-    NEXT_STEP; // Increases step, poffset, noffset
-
-    // k-2 steps: copy to next GPU
-    if (pushrecv) {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring.userRank[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::Copy(
-            thisOutput + offset,
-            sharedNextOutput + offset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring.userRank[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::DoubleCopy(
-            prevInput + poffset,
-            thisOutput + offset,
-            nextOutput + noffset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      rankDest = ring.userRank[1];
-      offset = chunkOffset + rankDest * size;
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
-    }
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    // Wait for last update from next then reset the flag
-    waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-    *ring.recvFlagFromNext = 0;
-
-    // Wait for last update from prev then reset the flag
-    waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
-    *ring.recvFlagFromPrev = 0;
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingAllGather(const void* sendbuff, void* recvbuff,
-    const int count, ncclComm* comm, cudaStream_t stream) {
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
-  } else {
-    KernelArgs<T> args;
-    ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
-    LAUNCH_KERNEL(AllGatherKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class AllGather {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
-    return RingAllGather<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, int count, ncclDataType_t datatype,
-    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
-    void* recvbuff, ncclComm_t comm, cudaStream_t stream) {
-  NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, "AllGather"));
-  return enqueue<AllGather, FuncNull>(sendbuff, recvbuff, count, datatype, 0, comm, stream);
-}
-
diff --git a/third_party/nccl/src/all_reduce.cu b/third_party/nccl/src/all_reduce.cu
deleted file mode 100644
index 2f38d6e5f7..0000000000
--- a/third_party/nccl/src/all_reduce.cu
+++ /dev/null
@@ -1,234 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void AllReduceKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ T* sharedNextOutput;
-  __shared__ DevRing<T> ring;
-  bool pushrecv = args.pushrecv;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-    if (pushrecv) {
-      *ring.sendPtrToPrev = (T*)args.ThisOutput;
-      Wait([=] {
-        return *ring.recvPtrFromNext != nullptr;
-      });
-      sharedNextOutput = *ring.recvPtrFromNext;
-      *ring.recvPtrFromNext = nullptr;
-    }
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
-
-  const int size = args.N;
-  const int nranks = args.nRanks;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int poffset, noffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int chunkOffset = 0; chunkOffset < size; chunkOffset += nranks*sliceSize) {
-    /////////////// begin AllReduce steps ///////////////
-    int offset;
-    int maxOffset;
-    int slice;
-    int chunkSize = min(sliceSize, DIVUP(size-chunkOffset,nranks));
-    ALIGN_SIZE(chunkSize, THREADS*UNROLL);
-
-    // step 0: push data to next GPU
-    slice = ring.userRank[nranks-1];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
-
-    Prims::Copy(
-        thisInput  + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP; // Increases step, poffset, noffset
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      slice = ring.userRank[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      Prims::Reduce(
-          prevInput  + poffset,
-          thisInput  + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
-    }
-
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
-    slice = ring.userRank[0];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
-
-    Prims::ReduceCopy(
-        prevInput  + poffset,
-        thisInput  + offset,
-        pushrecv ? (sharedNextOutput + offset) : (nextOutput + noffset),
-        thisOutput + offset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP;
-
-    if (pushrecv) {
-      // k-2 steps: copy result to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring.userRank[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::Copy(
-            thisOutput + offset,
-            sharedNextOutput + offset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-    } else {
-      // k-2 steps: copy result to next GPU
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring.userRank[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::DoubleCopy(
-            prevInput + poffset,
-            thisOutput + offset,
-            nextOutput + noffset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      slice = ring.userRank[1];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
-    }
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    // Wait for last update from next then reset the flag
-    waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-    *ring.recvFlagFromNext = 0;
-
-    // Wait for last update from prev then reset the flag
-    waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
-    *ring.recvFlagFromPrev = 0;
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingAllReduce(const void* sendbuff, void* recvbuff,
-    const int count, ncclComm* comm, cudaStream_t stream) {
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
-  } else {
-    KernelArgs<T> args;
-    ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
-    LAUNCH_KERNEL(AllReduceKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template <typename> class RedOp>
-class AllReduce {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
-    return RingAllReduce<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
-  NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, 0, comm, "AllReduce"));
-  return enqueue<AllReduce>(sendbuff, recvbuff, count, datatype, op, 0, comm, stream);
-}
-
diff --git a/third_party/nccl/src/broadcast.cu b/third_party/nccl/src/broadcast.cu
deleted file mode 100644
index 3a7cb119cf..0000000000
--- a/third_party/nccl/src/broadcast.cu
+++ /dev/null
@@ -1,164 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 4
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  boffset += sliceSize; \
-  if (boffset == buffSize) boffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void BroadcastKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ T* sharedNextOutput;
-  __shared__ DevRing<T> ring;
-  bool pushrecv = args.pushrecv;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-    if (pushrecv) {
-      *ring.sendPtrToPrev = (T*)args.ThisOutput;
-      Wait([=] {
-        return *ring.recvPtrFromNext != nullptr;
-      });
-      sharedNextOutput = *ring.recvPtrFromNext;
-      *ring.recvPtrFromNext = nullptr;
-    }
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, 0);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
-
-  const int size = args.N;
-  const int rank = ring.userRank[0];
-  const int nextRank = ring.userRank[1];
-  const int root = args.root;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int boffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int offset = 0; offset < size; offset += sliceSize) {
-    int maxOffset = size-offset;
-    if (rank == root) {
-      Prims::Copy(
-          thisInput + offset,
-          pushrecv ? sharedNextOutput + offset : nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
-    } else if (nextRank == root) {
-      if (pushrecv) maxOffset = 0; // Only wait for signals
-      Prims::Copy(
-          prevInput  + boffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      if (pushrecv) {
-        Prims::Copy(
-            thisOutput + offset,
-            sharedNextOutput + offset,
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      } else {
-        Prims::DoubleCopy(
-            prevInput + boffset,
-            thisOutput + offset,
-            nextOutput + boffset,
-	    sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-      }
-    }
-    NEXT_STEP; // Increases step, boffset
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    if (nextRank != root) {
-      // Wait for last update from next then reset the flag
-      waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-      *ring.recvFlagFromNext = 0;
-    }
-
-    if (rank != root) {
-      // reset the flag
-      *ring.recvFlagFromPrev = 0;
-    }
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 256
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingBroadcast(void* buff, const int count, const int root,
-    ncclComm* comm, cudaStream_t stream) {
-  if (comm->nRanks != 1) {
-    KernelArgs<T> args;
-    ArgsSetup(&args, buff, buff, root, count, comm);
-    LAUNCH_KERNEL(BroadcastKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class Broadcast {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int root, ncclComm* comm, cudaStream_t stream) {
-    return RingBroadcast<RedOp<T>, T>(recvbuff, count, root, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclBcast, void* buff, int count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  NCCLCHECK(ArgsCheck(buff, buff, count, datatype, ncclSum, root, comm, "Bcast"));
-  return enqueue<Broadcast, FuncNull>(nullptr, buff, count, datatype, root, comm, stream);
-}
-
diff --git a/third_party/nccl/src/common_coll.h b/third_party/nccl/src/common_coll.h
deleted file mode 100644
index 54050f8e46..0000000000
--- a/third_party/nccl/src/common_coll.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef COMMON_COLL_H_
-#define COMMON_COLL_H_
-
-#include "core.h"
-
-static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
-  cudaPointerAttributes attr;
-  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
-  if (err != cudaSuccess || attr.devicePointer == NULL) {
-    WARN("%s : %s is not a valid pointer\n", opname, ptrname);
-    return ncclInvalidDevicePointer;
-  }
-  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
-    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d \n", opname, ptrname, attr.device, comm->cudaDev);
-    return ncclInvalidDevicePointer;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
-  if (ptr == NULL) {
-    WARN("%s : %s argument is NULL", opname, ptrname);
-    return ncclInvalidArgument;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, int count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
-  NCCLCHECK(PtrCheck(comm, opname, "comm"));
-  // First, the easy ones
-  if (root < 0 || root >= comm->nRanks) {
-    WARN("%s : invalid root %d (root should be in the 0..%d range)\n", opname, root, comm->nRanks);
-    return ncclInvalidRank;
-  }
-  if (type < 0 || type >= nccl_NUM_TYPES) {
-    WARN("%s : invalid type %d\n", opname, type);
-    return ncclInvalidType;
-  }
-  if (op < 0 || op >= nccl_NUM_OPS) {
-    WARN("%s : invalid reduction operation %d\n", opname, op);
-    return ncclInvalidOperation;
-  }
-  if (count < 0) {
-    WARN("%s : invalid count %d\n", opname, count);
-    return ncclInvalidArgument;
-  }
-
-  // Check pointers
-  NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname))
-  if (strcmp(opname, "Reduce") == 0 && comm->rank != root) {
-    // No need to check recvbuff pointer for non-root reduce
-    return ncclSuccess;
-  }
-  NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname))
-  return ncclSuccess;
-}
-
-// Kernel launch
-template<typename T>
-struct KernelArgs {
-  // general parameters
-  int nRanks;
-  int root;
-  int buffSize;
-  int N;
-  int opIndex;
-  volatile int * __restrict__ opCounter;
-  int * __restrict__ doneCount;
-  bool pushrecv;
-
-  // some pre-computed sizes
-  int SliceSize;
-  int SliceOffset;
-  int ChunkSize;
-  int NumChunks;
-
-  // local and remote input, output, and buffer
-  const T * __restrict__ ThisInput;
-  T * __restrict__ ThisOutput;
-
-  DevRing<char>* ring;
-};
-
-template<typename T>
-void ArgsSetup(KernelArgs<T> *args, const void* sendbuff, void* recvbuff,
-		const int root, const int count, ncclComm *comm) {
-  args->nRanks = comm->nRanks;
-  args->root = root;
-  args->buffSize = comm->buffSize;
-  args->N = count;
-  args->opIndex = comm->opSched;
-  args->opCounter = comm->opCounter;
-  args->ThisInput = (const T*)sendbuff;
-  args->ThisOutput = (T*)recvbuff;
-  args->ring = comm->devRing;
-  args->pushrecv = comm->globalMemSpace;
-}
-
-#define LAUNCH_KERNEL(K, THREADS, UNROLL, FUNC, T, \
-		args, stream) do { \
-  dim3 grid(1, 1, 1); \
-  dim3 block(THREADS+1, 1, 1); \
-  void* argptrs[] = {&args}; \
-  CUDACHECK(cudaLaunchKernel( \
-            (void*)K<THREADS, UNROLL, FUNC, T>, \
-            grid, block, argptrs, 0, stream), ncclUnhandledCudaError); \
-} while (0)
-
-#endif
diff --git a/third_party/nccl/src/common_kernel.h b/third_party/nccl/src/common_kernel.h
deleted file mode 100644
index b96519f78a..0000000000
--- a/third_party/nccl/src/common_kernel.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef COMMON_KERNEL_H_
-#define COMMON_KERNEL_H_
-
-#include <cstdio>
-#include <cstdint>
-
-#include <cuda_runtime.h>
-
-// BAR macro and helpers
-#define WARP_SIZE 32
-#define ROUNDUP(x, y)                                                           \
-    (((((x) + (y) - 1) / (y))) * (y))
-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-#define BAR_EXEC(type, barid, nthreads) \
-    asm("bar." #type " " #barid ", " #nthreads ";\n\t")
-#define BAR_EXPAND(type, barid, nthreads) \
-    BAR_EXEC(type, barid, (nthreads))
-
-// Named barrier macro.
-// Expands to asm("bar.type barid, nthreads") where
-// nthreads has been rounded up to WARP_SIZE.
-#define BAR(type, barid, nthreads) \
-    BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
-
-template<typename T> inline __device__
-T vFetch(const volatile T* ptr) {
-  return *ptr;
-}
-
-template<typename T> inline __device__
-void vStore(volatile T* ptr, const T val) {
-  *ptr = val;
-}
-
-#ifdef CUDA_HAS_HALF
-#if CUDART_VERSION < 9000
-template<> inline __device__
-half vFetch<half>(const volatile half* ptr) {
-  half r;
-  r.x = ptr->x;
-  return r;
-}
-template<> inline __device__
-void vStore<half>(volatile half* ptr, const half val) {
-  ptr->x = val.x;
-}
-#else
-template<> inline __device__
-half vFetch<half>(const volatile half* ptr) {
-  return *((half*)ptr);
-}
-template<> inline __device__
-void vStore<half>(volatile half* ptr, const half val) {
-  *((half*)ptr) = val;
-}
-#endif
-#endif
-
-__device__ unsigned int spinct;
-
-// Spin wait until func evaluates to true
-template<typename FUNC>
-__device__ inline void Wait(const FUNC& func) {
-  while (!func()) {
-    // waste time
-    atomicInc(&spinct, 10);
-  }
-}
-
-typedef uint64_t PackType;
-
-// unpack x and y to elements of type T and apply FUNC to each element
-template<class FUNC, typename T>
-struct MULTI {
-  __device__ PackType operator()(const PackType x, const PackType y) const;
-};
-
-template<class FUNC>
-struct MULTI<FUNC, char> {
-  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
-      "PackType must be twice the size of uint32_t.");
-  union converter {
-    PackType storage;
-    struct {
-      uint32_t a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    // for char, we do these as vector ops
-    cr.a = FUNC()(cx.a, cy.a);
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage;
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, int> {
-  static_assert(sizeof(PackType) == 2 * sizeof(int),
-      "PackType must be twice the size of int.");
-  union converter {
-    PackType storage;
-    struct {
-      int a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    cr.a = FUNC()(cx.a, cy.a);
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage;
-  }
-};
-
-#ifdef CUDA_HAS_HALF
-template<class FUNC>
-struct MULTI<FUNC, half> {
-  static_assert(sizeof(PackType) == 4 * sizeof(half),
-      "PackType must be four times the size of half.");
-
-  struct PackHalf2 {
-    half2 a, b;
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    struct PackHalf2 cx, cy, cr;
-    cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
-    cy = *(reinterpret_cast<const struct PackHalf2*>(&y));
-
-    cr.a = FUNC()(cx.a, cy.a);
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return *(reinterpret_cast<PackType*>(&cr));
-  }
-};
-#endif
-
-template<class FUNC>
-struct MULTI<FUNC, float> {
-  static_assert(sizeof(PackType) == 2 * sizeof(float),
-      "PackType must be twice the size of float.");
-  union converter {
-    PackType storage;
-    struct {
-      float a, b;
-    };
-  };
-
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-
-    cr.a = FUNC()(cx.a, cy.a);
-    cr.b = FUNC()(cx.b, cy.b);
-
-    return cr.storage;
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, double> {
-  static_assert(sizeof(PackType) == sizeof(double),
-      "PackType must be the same size as double.");
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
-    return __double_as_longlong(rv);
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, unsigned long long> {
-  static_assert(sizeof(PackType) == sizeof(unsigned long long),
-      "PackType must be the same size as unsigned long long.");
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    unsigned long long rv = FUNC()(x, y);
-    return rv;
-  }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, long long> {
-  static_assert(sizeof(PackType) == sizeof(long long),
-      "PackType must be the same size as long long.");
-  __device__ PackType operator()(const PackType x, const PackType y) const {
-    long long rv = FUNC()((long long)x, (long long)y);
-    return rv;
-  }
-};
-
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
-__device__ inline void ReduceCopy(
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1,
-    volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1, const int idx) {
-  T val = vFetch(src0+idx);
-  if (TWO_INPUTS) {
-    val = FUNC()(val, vFetch(src1+idx));
-  }
-  vStore(dest0+idx, val);
-  if (TWO_OUTPUTS) {
-    vStore(dest1+idx, val);
-  }
-}
-
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL, int THREADS>
-__device__ inline void ReduceCopy64b(
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1,
-    volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1, const int offset) {
-  PackType t0[UNROLL];
-  PackType t1[UNROLL];
-  #pragma unroll
-  for (int u = 0; u < UNROLL; ++u) {
-    int idx = offset + u*THREADS;
-    t0[u] = (reinterpret_cast<const volatile PackType *>(src0))[idx];
-    if (TWO_INPUTS) {
-      t1[u] = (reinterpret_cast<const volatile PackType *>(src1))[idx];
-    }
-  }
-  #pragma unroll
-  for (int u = 0; u < UNROLL; ++u) {
-    int idx = offset + u*THREADS;
-    PackType val = TWO_INPUTS ? MULTI<FUNC, T>()(t0[u], t1[u]) : t0[u];
-    (reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
-    if (TWO_OUTPUTS) {
-      (reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
-    }
-  }
-}
-
-#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
-  size_t ptrval = reinterpret_cast<size_t>(ptr);
-  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
-    bool HAS_SRC1>
-__device__ inline void ReduceOrCopy(const int tid,
-    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
-    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
-    int N) {
-  if (N<=0) {
-    return;
-  }
-
-  int Npreamble = (N<alignof(PackType)) ? N : AlignUp(dest0, alignof(PackType)) - dest0;
-
-  // stage 0: check if we'll be able to use the fast, 64-bit aligned path.
-  // If not, we'll just use the slow preamble path for the whole operation
-  bool alignable = (((AlignUp(src0,  alignof(PackType)) == src0  + Npreamble)) &&
-      (!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
-      (!HAS_SRC1  || (AlignUp(src1,  alignof(PackType)) == src1  + Npreamble)));
-
-  if (!alignable) {
-    Npreamble = N;
-  }
-
-  // stage 1: preamble: handle any elements up to the point of everything coming
-  // into alignment
-  for (int idx = tid; idx < Npreamble; idx += THREADS) {
-    // ought to be no way this is ever more than one iteration, except when
-    // alignable is false
-    ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
-  }
-
-  // stage 2: fast path: use 64b loads/stores to do the bulk of the work,
-  // assuming the pointers we have are all 64-bit alignable.
-  if (alignable) {
-    const int PackFactor = sizeof(PackType) / sizeof(T);
-    int Nrem = N - Npreamble;
-    dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
-    src0  += Npreamble; if (HAS_SRC1)  { src1  += Npreamble; }
-
-    // stage 2a: main loop
-    int Nalign2a = (Nrem / (PackFactor * UNROLL * THREADS))
-        * (UNROLL * THREADS); // round down
-
-    #pragma unroll 1 // don't unroll this loop
-    for (int idx = tid; idx < Nalign2a; idx += UNROLL * THREADS) {
-      ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL, THREADS>(src0, src1, dest0, dest1, idx);
-    }
-
-    int Ndone2a = Nalign2a * PackFactor;
-    Nrem -= Ndone2a;
-
-    // stage 2b: slightly less optimized for section when we don't have full
-    // UNROLLs
-
-    int Nalign2b = Nrem / PackFactor;
-
-    #pragma unroll 4
-    for (int idx = Nalign2a + tid; idx < Nalign2a + Nalign2b; idx += THREADS) {
-      ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, 1, 0>(src0, src1, dest0, dest1, idx);
-    }
-
-    int Ndone2b = Nalign2b * PackFactor;
-    Nrem -= Ndone2b;
-    int Ndone2 = Ndone2a + Ndone2b;
-    dest0 += Ndone2; if (HAS_DEST1) { dest1 += Ndone2; }
-    src0  += Ndone2; if (HAS_SRC1)  { src1  += Ndone2; }
-
-    // stage 2c: tail
-
-    for (int idx = tid; idx < Nrem; idx += THREADS) {
-      // never ought to make it more than one time through this loop.  only a
-      // few threads should even participate
-      ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
-    }
-  } // done fast path
-}
-
-template <typename T>
-__device__ inline void incrementOpCounter(const KernelArgs<T> *args) {
-  // increment comm's operation counts
-  __threadfence_system(); // Technically need to ensure that cleared flags
-  // are visible before incrementing op counter.
-  *args->opCounter = args->opIndex+1;
-}
-
-template <int THREADS, typename T> __device__ __forceinline__
-void LoadRing(const DevRing<char>* src, DevRing<T>* dst) {
-  enum { NUM_WORDS = sizeof(DevRing<char>) / sizeof(long long) };
-  static_assert(sizeof(DevRing<char>) % sizeof(long long) == 0, "Bad alignment");
-  static_assert(THREADS >= NUM_WORDS, "Not enough threads to load DevRing");
-  static_assert(sizeof(DevRing<char>) == sizeof(DevRing<T>), "DevRing size mismatch");
-  long long* lldst = reinterpret_cast<long long*>(dst);
-  const long long* llsrc = reinterpret_cast<const long long*>(src);
-  if (threadIdx.x < NUM_WORDS) {
-    lldst[threadIdx.x] = llsrc[threadIdx.x];
-  }
-}
-
-
-#endif // COMMON_KERNEL_H_
diff --git a/third_party/nccl/src/copy_kernel.h b/third_party/nccl/src/copy_kernel.h
deleted file mode 100644
index 0f69748fac..0000000000
--- a/third_party/nccl/src/copy_kernel.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef COPY_KERNEL_H_
-#define COPY_KERNEL_H_
-
-#include "common_kernel.h"
-
-template<typename T>
-struct FuncPassA {
-  __device__ T operator()(const T x, const T y) const {
-    return x;
-  }
-};
-
-#ifdef CUDA_HAS_HALF
-template <>
-struct FuncPassA<half> {
-  __device__ half2 operator()(const half2 x, const half2 y) const {
-    return x;
-  }
-  __device__ half operator()(const half x, const half y) const {
-    return x;
-  }
-};
-#endif
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, typename T>
-__device__ void Copy(volatile T * __restrict__ const dest,
-    const volatile T * __restrict__ const src, const int N) {
-  ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, false, false>(threadIdx.x,
-      dest, nullptr, src, nullptr, N);
-}
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, typename T>
-__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1,
-    const volatile T * __restrict__ const src, const int N) {
-  ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, true, false>(threadIdx.x,
-      dest0, dest1, src, nullptr, N);
-}
-
-#endif // COPY_KERNEL_H_
diff --git a/third_party/nccl/src/core.cu b/third_party/nccl/src/core.cu
deleted file mode 100644
index 1420d21c31..0000000000
--- a/third_party/nccl/src/core.cu
+++ /dev/null
@@ -1,1019 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "core.h"
-#include "libwrap.h"
-#include "common_coll.h"
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sched.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <cuda_runtime.h>
-#include <string.h>
-#include <errno.h>
-
-DebugLevel ncclDebugLevel;
-
-NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
-ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
-  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
-  pid_t pid = getpid();
-  static int count = 0;
-  int commId = __sync_fetch_and_add(&count, 1);
-  int len = snprintf(out->internal, NCCL_UNIQUE_ID_BYTES, "nccl-%d-%d", pid, commId);
-  if(strlen(out->internal) < len) {
-    WARN("ncclUniqueId truncated");
-    return ncclInternalError;
-  }
-  return ncclSuccess;
-}
-
-
-static ncclResult_t shmOpen(const char* shmname, size_t bytes, void** ptr) {
-  int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
-  if (fd == -1) {
-    WARN("shm_open failed to open %s", shmname);
-    return ncclSystemError;
-  }
-
-  if (ftruncate(fd, bytes) == -1) {
-    WARN("ftruncate failed to allocate %ld bytes", bytes);
-    shm_unlink(shmname);
-    close(fd);
-    return ncclSystemError;
-  }
-
-  *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  if (*ptr == MAP_FAILED) {
-    WARN("failure in mmap");
-    shm_unlink(shmname);
-    close(fd);
-    return ncclSystemError;
-  }
-
-  close(fd);
-  return ncclSuccess;
-}
-
-static ncclResult_t shmUnlink(const char* shmname) {
-  if(shm_unlink(shmname) == -1) {
-    WARN("smh_unlink failed");
-    return ncclSystemError;
-  } else {
-    return ncclSuccess;
-  }
-}
-
-static ncclResult_t shmUnmap(void* ptr, size_t bytes) {
-  if(munmap(ptr, bytes) == -1) {
-    WARN("munmap failed");
-    return ncclSystemError;
-  } else {
-    return ncclSuccess;
-  }
-}
-
-
-typedef struct {
-  int rank;
-  int ndev;
-  int cudaDev;
-  int sortId;
-  pid_t pid;
-  ncclMem* hostptr;
-  ncclMem* devptr;
-  cudaIpcMemHandle_t devipc;
-  size_t buffSize;
-} RankEntry;
-
-static int compRanks(const void* a, const void* b) {
-  const RankEntry* A = (const RankEntry*)a;
-  const RankEntry* B = (const RankEntry*)b;
-  if (A->sortId < B->sortId) return -1;
-  if (A->sortId > B->sortId) return  1;
-  return 0;
-}
-
-static void orderRanks(RankEntry* ranks, int count) {
-  qsort(ranks, count, sizeof(RankEntry), compRanks);
-}
-
-
-typedef struct {
-  union {
-    struct {
-      volatile int bar;
-      int globalMemSpaceBroke;
-    };
-    char pad[16];
-   };
-   RankEntry ranks[1];
-} RankGather;
-
-static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId,
-    int ndev, int rank, RankEntry myInfo) {
-  size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
-  RankGather* tmp = NULL;
-  int bar_tmp;
-
-  ncclResult_t res = shmOpen(commId.internal, bytes, (void**)&tmp);
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to open shm segment for gather", rank);
-    return res;
-  }
-
-  tmp->ranks[rank] = myInfo;
-
-  bar_tmp = tmp->bar - 1;
-  bool swapped;
-  do {
-    bar_tmp += 1;
-    if (bar_tmp == ndev-1) { // everyone is done
-      ncclResult_t res = shmUnlink(commId.internal);
-      if (res != ncclSuccess) {
-        WARN("rank %d failed to unlink shm segment for gather", rank);
-        shmUnmap(tmp, bytes);
-        return res;
-      }
-
-      orderRanks(tmp->ranks, ndev);
-    }
-    swapped = __sync_bool_compare_and_swap(&tmp->bar, bar_tmp, bar_tmp+1);
-  } while(!swapped);
-
-  while (tmp->bar < ndev)
-    sched_yield();
-  __sync_synchronize();
-
-  *gather = tmp;
-  return ncclSuccess;
-}
-
-static void syncRingDirect(RankGather* gather, int* globalMemSpaceOk) {
-  int bar_tmp = gather->bar - 1;
-  int ndev = gather->ranks[0].ndev;
-  bool swapped;
-  do {
-    bar_tmp += 1;
-    swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
-  } while(!swapped);
-
-  while (gather->bar < 2*ndev) // Wait for all ranks to arrive at this second barrier
-    sched_yield();
-  __sync_synchronize();
-
-  *globalMemSpaceOk = gather->globalMemSpaceBroke ? 0 : 1;
-}
-
-static ncclResult_t closeGather(RankGather* gather, int ndev) {
-  int bar_tmp = gather->bar - 1;
-  bool swapped;
-  do {
-    bar_tmp += 1;
-    swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
-  } while(!swapped);
-
-  while (gather->bar < 3*ndev) // Wait for all ranks to arrive at this third barrier
-    sched_yield();
-  __sync_synchronize();
-
-  size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
-  ncclResult_t res = shmUnmap(gather, bytes);
-  if (res != ncclSuccess) {
-    WARN("failed to unmap %ld bytes of gather", bytes);
-    return res;
-  }
-
-  return ncclSuccess;
-}
-
-
-static ncclResult_t allocDevMem(ncclMem** ptr, size_t buffSize) {
-  size_t size = offsetof(struct ncclMem, buff) + buffSize;
-  cudaError_t res = cudaMalloc((void**)ptr, size);
-  if (res != cudaSuccess) {
-    *ptr = NULL;
-    WARN("failed to allocate %lu byte device buffer", size);
-    return ncclCudaMallocFailed;
-  }
-  if (cudaMemset(*ptr, 0, size) != cudaSuccess) {
-    WARN("failed to memset device buffer.");
-    cudaFree(*ptr);
-    *ptr = NULL;
-    return ncclUnhandledCudaError;
-  }
-  return ncclSuccess;
-}
-
-static const int ShmMapped = 1;
-static const int ShmLinked = 2;
-
-static ncclResult_t allocHostMem(ncclMem** ptr, size_t buffSize) {
-  size_t size = offsetof(struct ncclMem, buff) + buffSize;
-  cudaError_t res = cudaMallocHost((void**)ptr, size);
-  if (res != cudaSuccess) {
-    *ptr = NULL;
-    WARN("failed to allocate %lu byte host buffer", size);
-    return ncclSystemError;
-  }
-  memset(*ptr, 0, size);
-  return ncclSuccess;
-}
-
-static ncclResult_t openHostMemShm(const char* shmname, ncclMem** ptr, size_t buffSize) {
-  size_t size = offsetof(struct ncclMem, buff) + buffSize;
-  ncclResult_t res = shmOpen(shmname, size, (void**)ptr);
-  if (res != ncclSuccess) {
-    WARN("failed to allocate %lu byte shm buffer", size);
-    *ptr = NULL;
-    return res;
-  }
-
-  if(cudaHostRegister(*ptr, size, cudaHostRegisterMapped) != cudaSuccess) {
-    WARN("failed to register host buffer");
-    shmUnlink(shmname);
-    shmUnmap(*ptr, size);
-    *ptr = NULL;
-    return ncclUnhandledCudaError;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm) {
-  char busId[13];
-  nvmlDevice_t nvmlHandle;
-  cudaError_t res = cudaDeviceGetPCIBusId(busId, 13, comm->cudaDev);
-  if (res == cudaErrorInvalidDevice) {
-    WARN("rank %d attempted to access an invalid cuda device %d", rank, comm->cudaDev);
-    return ncclInvalidDeviceIndex;
-  } else if (res != cudaSuccess) {
-    WARN("rank %d failed to get PCI Bus Id for device %d", rank, comm->cudaDev);
-    return ncclUnhandledCudaError;
-  }
-  INFO("rank %d using device %d (%s)", rank, comm->cudaDev, busId);
-
-  if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) {
-    WARN("rank %d failed to get nvml handle for device %s", rank, busId);
-    return ncclUnhandledCudaError;
-  }
-  // Order by nvml index
-  if (wrapNvmlDeviceGetIndex(nvmlHandle, (unsigned*)&info->sortId) != ncclSuccess) {
-    WARN("rank %d failed to get nvml device index for device %d", rank, comm->cudaDev);
-    return ncclUnhandledCudaError;
-  }
-
-  info->rank = rank;
-  info->ndev = comm->nRanks;
-  info->cudaDev = comm->cudaDev;
-  info->pid = getpid();
-  info->buffSize = comm->buffSize;
-  info->hostptr = comm->hostMem;
-  info->devptr = comm->devMem;
-  if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) {
-    WARN("rank %d failed to open CUDA IPC handle", rank);
-    return ncclUnhandledCudaError;
-  }
-
-  return ncclSuccess;
-}
-
-
-static ncclResult_t commClearMaps(ncclComm_t comm) {
-  ncclResult_t res, retval = ncclSuccess;
-  cudaError_t cures;
-
-  for(int d=0; d<comm->nRanks; ++d) {
-    if (comm->ptrs[d].hostCleanup != NULL) {
-      cures = cudaHostUnregister(comm->ptrs[d].hostCleanup);
-      if (cures != cudaSuccess) {
-        WARN("rank %d failed to unregister handle to device %d",
-          comm->rank, d);
-          retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
-      }
-      res = shmUnmap(comm->ptrs[d].hostCleanup, offsetof(ncclMem, buff) + comm->buffSize);
-      if (res != ncclSuccess) {
-        WARN("rank %d failed to unmap handle to device %d",
-          comm->rank, d);
-          retval = (retval == ncclSuccess) ? res : retval;
-      }
-      comm->ptrs[d].hostCleanup = NULL;
-    }
-
-    if (comm->ptrs[d].devCleanup != NULL) {
-      cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].devCleanup);
-      if (cures != cudaSuccess) {
-        WARN("rank %d failed to close IPC handle to device %d: %s",
-          comm->rank, d, cudaGetErrorString(cures));
-        retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
-      }
-    }
-  }
-
-  if (comm->userFromRing != NULL)
-    memset(comm->userFromRing, 0, sizeof(int)*comm->nRanks);
-  if (comm->ncclFromRing != NULL)
-    memset(comm->ncclFromRing, 0, sizeof(int)*comm->nRanks);
-
-  if (comm->devUserFromRing != NULL) {
-    cures = cudaMemset(comm->devUserFromRing, 0, sizeof(int)*comm->nRanks);
-    if (cures != cudaSuccess) {
-      WARN("Faild to clear dev map: %s", cudaGetErrorString(cures));
-      retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
-    }
-  }
-
-  if (comm->devRing != NULL) {
-    cures = cudaMemset(comm->devRing, 0, sizeof(DevRing<char>));
-    if (cures != cudaSuccess) {
-      WARN("Failed to clear devRing: %s", cudaGetErrorString(cures));
-      retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
-    }
-  }
-  return retval;
-}
-
-static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks, int* globalMemSpaceBroke) {
-  int ndev = comm->nRanks;
-  comm->rank = rank;
-
-  if (ndev > MAXRANKS) {
-    WARN("%d ranks exceeds MAXRANKS of %d", ndev, MAXRANKS);
-    return ncclUnsupportedDeviceCount;
-  }
-
-  // Check for inconsistencies between ranks
-  // If two ranks use the same rank, then one slot of
-  // ranks[] will be left unset with zero ndev/buffSize.
-  for(int i=0; i<ndev; ++i) {
-    if (ranks[i].buffSize != comm->buffSize
-        || ranks[i].ndev != comm->nRanks) {
-      commClearMaps(comm);
-      return ncclRankMismatch;
-    }
-  }
-
-  // Find self among ranks of gather
-  int myNcclId = -1;
-  for (int i=0; i<ndev; ++i) {
-    if(ranks[i].rank == rank) {
-      myNcclId = i;
-      break;
-    }
-  }
-  if (myNcclId == -1) {
-    WARN("rank %d not found in communicator", rank);
-    return ncclInvalidRank;
-  }
-
-  for(int ringPos=0; ringPos<ndev; ++ringPos) {
-    int ncclPos = (ringPos+myNcclId) % ndev; // ring order relative to self
-    int userRank = ranks[ncclPos].rank;
-    comm->userFromRing[ringPos] = userRank;
-    comm->ncclFromRing[ringPos] = ncclPos;
-  }
-
-  int myDev = ranks[myNcclId].cudaDev;
-  pid_t myPid = ranks[myNcclId].pid;
-
-  for (int i=0; i<ndev; ++i) {
-    int iRank = ranks[i].rank;
-    int iDev = ranks[i].cudaDev;
-    pid_t iPid = ranks[i].pid;
-    int canpeer = 0;
-
-    int iIsNeighbor = (i == (myNcclId+1)%ndev) || (i == (myNcclId+ndev-1)%ndev);
-
-    if (iIsNeighbor && cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
-      INFO("peer query failed between rank %d (dev %d) and rank %d (dev %d)",
-        rank, myDev, iRank, iDev);
-      canpeer = 0;
-    }
-
-    cudaError_t err;
-    ncclMem* remoteHostBuff;
-
-    comm->ptrs[i].type = NodeRef::HOST; // Assume host buffer
-    comm->ptrs[i].devCleanup = NULL;
-    comm->ptrs[i].hostCleanup = NULL;
-
-    if (iPid == myPid) {
-      remoteHostBuff = ranks[i].hostptr;
-
-      if (myDev == iDev) { // shared device
-        INFO("rank access %d -> %d via common device", rank, iRank);
-        comm->ptrs[i].type = NodeRef::DEVICE;
-        comm->ptrs[i].local = ranks[myNcclId].devptr;
-        comm->ptrs[i].remote = ranks[i].devptr;
-      } else if (canpeer) {
-        INFO("rank access %d -> %d via P2P device mem", rank, iRank);
-        err = cudaDeviceEnablePeerAccess(iDev, 0);
-        if (err == cudaErrorPeerAccessAlreadyEnabled) {
-          cudaGetLastError();
-        } else if (err != cudaSuccess) {
-          WARN("rank %d failed to peer with device %d: %s",
-              rank, iDev, cudaGetErrorString(err));
-          commClearMaps(comm);
-          return ncclUnhandledCudaError;
-        }
-        comm->ptrs[i].type = NodeRef::DEVICE;
-        comm->ptrs[i].local = ranks[myNcclId].devptr;
-        comm->ptrs[i].remote = ranks[i].devptr;
-      }
-    } else { // Separate processes
-      *globalMemSpaceBroke = 1;
-      char rankname[1024];
-      sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
-      if (openHostMemShm(rankname, &remoteHostBuff, ranks[i].buffSize)
-          != ncclSuccess) {
-        WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      comm->ptrs[i].hostCleanup = remoteHostBuff;
-
-      // TODO: Extend to same device (MPS) case.
-      // At present that would go through host mem.
-      if (canpeer) {
-        INFO("rank access %d -> %d via IPC device mem", rank, iRank);
-        comm->ptrs[i].type = NodeRef::DEVICE;
-        comm->ptrs[i].local  = ranks[myNcclId].devptr;
-        err = cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote),
-            ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess);
-        if (err != cudaSuccess) {
-          WARN("rank %d failed to open Ipc handle to rank %d: %s",
-              rank, iRank, cudaGetErrorString(err));
-          commClearMaps(comm);
-          return ncclUnhandledCudaError;
-        }
-        comm->ptrs[i].devCleanup = comm->ptrs[i].remote;
-      }
-    }
-
-    err = cudaHostGetDevicePointer(&comm->ptrs[i].opCounter,
-          &(remoteHostBuff->opCounter), 0);
-    if (err != cudaSuccess) {
-      WARN("rank %d failed to obtain %d's zero copy pointer: %s",
-          rank, iRank, cudaGetErrorString(err));
-      commClearMaps(comm);
-      return ncclUnhandledCudaError;
-    }
-
-    if (comm->ptrs[i].type == NodeRef::HOST) {
-      *globalMemSpaceBroke = 1;
-      INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
-      if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myNcclId].hostptr, 0) != cudaSuccess) {
-        WARN("rank %d failed to map zero copy buffer to device", rank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, remoteHostBuff, 0) != cudaSuccess) {
-        WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-    }
-  }
-
-  // Setup device-side ring view
-  if (cudaMemcpy(comm->devUserFromRing, comm->userFromRing, ndev*sizeof(int),
-      cudaMemcpyHostToDevice) != cudaSuccess) {
-    WARN("rank %d failed to copy maps to device", rank);
-    commClearMaps(comm);
-    return ncclUnhandledCudaError;
-  }
-
-  DevRing<char> ringTemp;
-  memcpy(ringTemp.userRank, comm->userFromRing, ndev*sizeof(int));
-
-  int prevIdx = comm->ncclFromRing[comm->nRanks-1];
-  int nextIdx = comm->ncclFromRing[1 % comm->nRanks];
-  NodeRef* prevPtrs = comm->ptrs+prevIdx;
-  NodeRef* nextPtrs = comm->ptrs+nextIdx;
-
-  ringTemp.prevOpCounter    = prevPtrs->opCounter;
-  ringTemp.nextOpCounter    = nextPtrs->opCounter;
-  ringTemp.sendFlagToNext   = nextPtrs->remote->flags;
-  ringTemp.recvFlagFromPrev = prevPtrs->local->flags;
-  ringTemp.sendFlagToPrev   = prevPtrs->remote->flags+1;
-  ringTemp.recvFlagFromNext = nextPtrs->local->flags+1;
-
-  ringTemp.recvPtrFromNext = (char**)&nextPtrs->local->recvPtrs;
-  ringTemp.sendPtrToPrev   = (char**)&prevPtrs->remote->recvPtrs;
-
-  ringTemp.recvBuffer = prevPtrs->local->buff;
-  ringTemp.sendBuffer = nextPtrs->remote->buff;
-
-  if (cudaMemcpy(comm->devRing, &ringTemp, sizeof(ringTemp),
-      cudaMemcpyHostToDevice) != cudaSuccess) {
-    WARN("rank %d failed to copy ring maps to device", rank);
-    commClearMaps(comm);
-    return ncclUnhandledCudaError;
-  }
-
-  return ncclSuccess;
-}
-
-static void initDebug() {
-  const char* nccl_debug = getenv("NCCL_DEBUG");
-  if (nccl_debug == NULL) {
-    ncclDebugLevel = NONE;
-  } else if (strcmp(nccl_debug, "VERSION") == 0) {
-    ncclDebugLevel = VERSION;
-  } else if (strcmp(nccl_debug, "WARN") == 0) {
-    ncclDebugLevel = WARN;
-  } else if (strcmp(nccl_debug, "INFO") == 0) {
-    ncclDebugLevel = INFO;
-    INFO("NCCL debug level set to INFO");
-  } else if (strcmp(nccl_debug, "ABORT") == 0) {
-    ncclDebugLevel = ABORT;
-    INFO("NCCL debug level set to ABORT");
-  }
-}
-
-static void commFree(ncclComm_t comm) {
-  if (comm == NULL)
-    return;
-
-  if (comm->doneEvent != NULL)
-    if (cudaEventDestroy(comm->doneEvent) != cudaSuccess)
-      INFO("ncclComm failed to destroy doneEvent");
-
-  ncclResult_t res = commClearMaps(comm);
-  if (res != ncclSuccess)
-    INFO("failed to cleanup comm maps");
-
-  if (comm->devRing != NULL)
-    if (cudaFree(comm->devRing) != cudaSuccess)
-      INFO("commFree failed to free devRing");
-
-  if (comm->userFromRing != NULL)
-    free(comm->userFromRing);
-
-  if (comm->devUserFromRing != NULL)
-    if (cudaFree(comm->devUserFromRing) != cudaSuccess)
-      INFO("commFree failed to free dev maps");
-
-  if (comm->ncclFromRing != NULL)
-    free(comm->ncclFromRing);
-
-  if (comm->devMem != NULL && cudaFree(comm->devMem) != cudaSuccess)
-    INFO("Failed to free devMap");
-
-  if (comm->hostMem != NULL) {
-    if (comm->hostMemState & ShmMapped) {
-      if (cudaHostUnregister(comm->hostMem) != cudaSuccess)
-        INFO("Failed to unregister hostMem");
-      size_t size = offsetof(ncclMem, buff) + comm->buffSize;
-      if (shmUnmap(comm->hostMem, size) != ncclSuccess)
-        INFO("Failed to unmap hostMem");
-      comm->hostMemState ^= ShmMapped;
-    } else {
-      cudaFreeHost(comm->hostMem);
-    }
-  }
-  free(comm);
-}
-
-static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, const ncclUniqueId* commId, int rank) {
-  size_t commBytes = offsetof(ncclComm, ptrs) + ndev*sizeof(NodeRef);
-  struct ncclComm* comm = (struct ncclComm*)malloc(commBytes);
-  if (comm == NULL) {
-    WARN("comm allocation failed");
-    return ncclSystemError;
-  }
-  memset(comm, 0, commBytes);
-
-  comm->nRanks = ndev;
-  cudaGetDevice(&comm->cudaDev);
-
-  const char* str = getenv("NCCL_BUFFSIZE");
-  int buffsize;
-  if (str != NULL) {
-    errno = 0;
-    buffsize = strtol(str, NULL, 10);
-    if (errno == ERANGE || buffsize == 0) {
-      INFO("rank %d invalid NCCL_BUFFSIZE: %s, using default %lu",
-          rank, str, DEFAULT_BUFFER_SIZE_BYTES);
-      buffsize = DEFAULT_BUFFER_SIZE_BYTES;
-    }
-  } else {
-    buffsize = DEFAULT_BUFFER_SIZE_BYTES;
-  }
-  comm->buffSize = buffsize;
-  INFO("rank %d using buffSize = %lu", rank, comm->buffSize);
-
-
-  ncclResult_t res;
-  res = allocDevMem(&comm->devMem, comm->buffSize);
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to allocate device buffer", rank);
-    commFree(comm);
-    return res;
-  }
-
-  if (cudaMalloc(&comm->devRing, sizeof(DevRing<char>)) != cudaSuccess) {
-    WARN("rank %d failed to allocate device-side ring views", rank);
-    commFree(comm);
-    return ncclCudaMallocFailed;
-  }
-
-  if (cudaMalloc(&comm->devUserFromRing, ndev*sizeof(int)) != cudaSuccess ) {
-    WARN("rank %d failed to allocated device maps", rank);
-    commFree(comm);
-    return ncclCudaMallocFailed;
-  }
-
-  comm->userFromRing = (int*)malloc(ndev*sizeof(int));
-  if (comm->userFromRing == NULL) {
-    WARN("rank %d failed to allocate host maps", rank);
-    commFree(comm);
-    return ncclSystemError;
-  }
-
-  comm->ncclFromRing = (int*)malloc(ndev*sizeof(int));
-  if (comm->ncclFromRing == NULL) {
-    WARN("rank %d failed to allocate host maps", rank);
-    commFree(comm);
-    return ncclSystemError;
-  }
-
-  if (cudaEventCreateWithFlags(&comm->doneEvent, cudaEventDisableTiming) != cudaSuccess) {
-    WARN("ncclComm on rank %d failed to create doneEvent", rank);
-    commFree(comm);
-    return ncclUnhandledCudaError;
-  }
-
-  if(commId == NULL) {
-    comm->hostMemState = 0;
-    res = allocHostMem(&comm->hostMem, comm->buffSize);
-  } else {
-    char rankname[1024];
-    sprintf(rankname, "%s-%d", commId->internal, rank);
-    res = openHostMemShm(rankname, &comm->hostMem, comm->buffSize);
-    if (res != ncclSuccess) {
-      WARN("rank %d failed to allocate host buffer", rank);
-      commFree(comm);
-      return res;
-    }
-    comm->hostMemState = ShmMapped | ShmLinked;
-  }
-
-  if (cudaHostGetDevicePointer(&comm->opCounter, &comm->hostMem->opCounter, 0) != cudaSuccess) {
-    WARN("ncclComm on rank %d failed to map opCounter to device", rank);
-    commFree(comm);
-    return ncclUnhandledCudaError;
-  }
-
-  *comret = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t devCommUpdate(ncclComm_t comm) {
-  // Copy the comm on the device
-  size_t commBytes = offsetof(ncclComm, ptrs) + comm->nRanks*sizeof(NodeRef);
-  if (cudaMemcpy(comm->devComm, comm, commBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
-    WARN("failed to copy device comm");
-    return ncclUnhandledCudaError;
-  }
-  // Fix the host pointer to be accessible from the device
-  void* dptr;
-  if (cudaHostGetDevicePointer(&dptr, comm->hostMem, 0) != cudaSuccess) {
-    WARN("failed to get device pointer for host mem");
-    return ncclUnhandledCudaError;
-  }
-  if (cudaMemcpy(&comm->devComm->hostMem, &dptr, sizeof(dptr), cudaMemcpyHostToDevice) != cudaSuccess) {
-    WARN("failed to update host pointer");
-    return ncclUnhandledCudaError;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t devCommSetup(ncclComm_t comm) {
-  // Fully duplicate the comm on the device
-  size_t commBytes = offsetof(ncclComm, ptrs) + comm->nRanks*sizeof(NodeRef);
-  if (cudaMalloc(&comm->devComm, commBytes) != cudaSuccess) {
-    WARN("failed to allocated device comm");
-    return ncclCudaMallocFailed;
-  }
-  return devCommUpdate(comm);
-}
-
-static ncclResult_t commUnlinkHostMem(ncclComm_t comm, ncclUniqueId commId, int rank) {
-  char rankname[1024];
-  sprintf(rankname, "%s-%d", commId.internal, rank);
-  if (comm->hostMemState & ShmLinked)
-    comm->hostMemState ^= ShmLinked;
-  return shmUnlink(rankname);
-}
-
-static void showVersion() {
-  static int shown = 0;
-  if (shown == 0 && ncclDebugLevel >= VERSION) {
-    printf("NCCL version %d.%d.%d compiled with CUDA %d.%d\n", NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH, CUDA_MAJOR, CUDA_MINOR);
-    fflush(stdout);
-    shown = 1;
-  }
-}
-
-NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
-ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
-  if (myrank == 0) showVersion();
-
-  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
-
-  if (ndev < 1) {
-    WARN("Invalid device count requested : %d", ndev);
-    return ncclUnsupportedDeviceCount;
-  }
-  if (myrank >= ndev || myrank < 0) {
-    WARN("Invalid rank %d, should be in the range 0..%d", myrank, ndev-1);
-    return ncclInvalidRank;
-  }
-
-  if (strlen(commId.internal) < 1 ||
-      strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) {
-    WARN("rank %d invalid commId", myrank);
-    return ncclInvalidArgument;
-  }
-
-  initDebug();
-  ncclResult_t res;
-  RankEntry myStuff;
-  RankGather* gath = NULL;
-
-  res = wrapSymbols();
-  if (res != ncclSuccess) {
-    WARN("NCCL failed to initialize client libs");
-    return res;
-  }
-
-  res = wrapNvmlInit();
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to initialize nvml", myrank);
-    return res;
-  }
-
-  res = commAlloc(newcomm, ndev, &commId, myrank);
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to allocate communicator", myrank);
-    return res;
-  }
-
-  res = populateRankInfo(&myStuff, myrank, *newcomm);
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to obtain rank info", myrank);
-    goto cleanup;
-  }
-
-  res = initGather(&gath, commId, ndev, myrank, myStuff);
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to gather rank info", myrank);
-    goto cleanup;
-  }
-
-  res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->globalMemSpaceBroke);
-  syncRingDirect(gath, &((*newcomm)->globalMemSpace));
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to build comm maps", myrank);
-    goto cleanup;
-  }
-
-  INFO("Global device memory space is %s", (*newcomm)->globalMemSpace ? "enabled" : "disabled");
-
-  res = closeGather(gath, ndev); // includes a barrier
-  gath = NULL;
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to close gather", myrank);
-    goto cleanup;
-  }
-
-  res = devCommSetup(*newcomm);
-  if (res != ncclSuccess) {
-    WARN("rank %d failed to copy dcomm", myrank);
-    goto cleanup;
-  }
-
-  res = ncclSuccess;
-  goto final;
-
-  cleanup:
-  if (gath != NULL)
-    closeGather(gath, ndev);
-  commFree(*newcomm);
-
-  final:
-  if ((*newcomm)->hostMemState & ShmLinked) {
-    if (commUnlinkHostMem(*newcomm, commId, myrank) != ncclSuccess)
-      INFO("rank %d failed to unlink host mem shm segment", myrank);
-  }
-
-  if (wrapNvmlShutdown() != ncclSuccess)
-    INFO("rank %d did not shutdown nvml properly", myrank);
-  return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
-ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
-  initDebug();
-
-  showVersion();
-
-  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
-
-  if (ndev < 1) {
-    WARN("Invalid device count requested : %d", ndev);
-    return ncclUnsupportedDeviceCount;
-  }
-
-  ncclResult_t res;
-  int savedDevice;
-  RankEntry* ranks = NULL;
-  int rank, cudaDev;
-  ncclComm_t comm = NULL;
-  char busId[13];
-  nvmlDevice_t nvmlHandle;
-  int affinity_set = 0;
-  int globalMemSpaceBroke = 0; // Assume direct access to recv ptr OK
-
-  res = wrapSymbols();
-  if (res != ncclSuccess) {
-    WARN("NCCL failed to initialize client libs");
-    return res;
-  }
-
-  cudaGetDevice(&savedDevice);
-  ranks = (RankEntry*)malloc(ndev*sizeof(RankEntry));
-  if (ranks == NULL) {
-    WARN("NCCL allocation failed");
-    return ncclSystemError;
-  }
-  memset(ranks, 0, ndev*sizeof(RankEntry));
-
-  res = wrapNvmlInit();
-  if (res != ncclSuccess) {
-    WARN("nccl failed to initialize nvml");
-    return res;
-  }
-
-  for(rank=0; rank<ndev; ++rank)
-    comms[rank] = NULL;
-
-  for (rank=0; rank<ndev; ++rank) {
-    cudaDev = (devlist == NULL) ? rank : devlist[rank];
-    if (cudaSetDevice(cudaDev) != cudaSuccess) {
-      WARN("rank %d failed to set cuda device %d", rank, cudaDev);
-      res = ncclInvalidDeviceIndex;
-      goto cleanup;
-    }
-
-    // Set CPU affinity
-    affinity_set = 0;
-    if (cudaDeviceGetPCIBusId(busId, 13, cudaDev) != cudaSuccess) {
-      INFO("rank %d failed to get PCI Bus Id for device %d", rank, cudaDev);
-      goto skipaffinity;
-    }
-    if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) {
-      INFO("rank %d failed to get nvml handle for device %s", rank, busId);
-      goto skipaffinity;
-    }
-    if (wrapNvmlDeviceSetCpuAffinity(nvmlHandle) != ncclSuccess) {
-      INFO("rank %d failed to set affinity", rank);
-      goto skipaffinity;
-    }
-    affinity_set = 1;
-    skipaffinity:
-
-    res = commAlloc(&comm, ndev, NULL, rank);
-    if (res != ncclSuccess) {
-      WARN("rank %d failed to allocate communicator", rank);
-      goto cleanup;
-    }
-    comms[rank] = comm;
-
-    if (affinity_set && wrapNvmlDeviceClearCpuAffinity(nvmlHandle) != ncclSuccess) {
-      INFO("rank %d set but failed to clear cpu affinity", rank);
-    }
-    res = populateRankInfo(ranks+rank, rank, comm);
-    if (res != ncclSuccess) {
-      WARN("rank %d failed to obtain rank info", rank);
-      goto cleanup;
-    }
-  }
-
-  orderRanks(ranks, ndev);
-  for(rank=0; rank<ndev; ++rank) {
-    comm = comms[rank];
-    cudaSetDevice(comm->cudaDev);
-    res = commBuildMaps(comm, NULL, rank, ranks, &globalMemSpaceBroke);
-    if (res != ncclSuccess) {
-      WARN("rank %d failed to build comm maps", rank);
-      goto cleanup;
-    }
-  }
-
-  INFO("Global device memory space is %s", (globalMemSpaceBroke) ? "disabled" : "enabled");
-  for(rank=0; rank<ndev; ++rank) {
-    comms[rank]->globalMemSpace = globalMemSpaceBroke ? 0 : 1;
-  }
- 
-  for(rank=0; rank<ndev; ++rank) {
-    res = devCommSetup(comms[rank]);
-    if (res != ncclSuccess) {
-      WARN("rank %d failed to copy dcomm", rank);
-      goto cleanup;
-    }
-  }
-
-  free(ranks);
-  ranks = NULL;
-  res = ncclSuccess;
-  goto final;
-
-  cleanup:
-  if (ranks != NULL)
-    free(ranks);
-  for(rank=0; rank<ndev; ++rank) {
-    if(comms[rank] != NULL) {
-      commFree(comms[rank]);
-    }
-  }
-
-  final:
-  if(wrapNvmlShutdown() != ncclSuccess)
-    INFO("NCCL did not shutdown nvml properly");
-  cudaSetDevice(savedDevice);
-  return res;
-}
-
-NCCL_API(void, ncclCommDestroy, ncclComm_t comm);
-void ncclCommDestroy(ncclComm_t comm) {
-  if (comm == NULL)
-    return;
-
-  int savedDevice;
-  cudaGetDevice(&savedDevice);
-  int commDevice = comm->cudaDev;
-
-  if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(commDevice), void());
-  }
-
-  commFree(comm);
-
-  if (savedDevice != commDevice)
-    cudaSetDevice(savedDevice);
-}
-
-NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
-const char* ncclGetErrorString(ncclResult_t code) {
-  switch (code) {
-  case ncclSuccess                : return "no error";
-  case ncclUnhandledCudaError     : return "unhandled cuda error";
-  case ncclSystemError            : return "system error";
-  case ncclInternalError          : return "internal error";
-  case ncclInvalidDevicePointer   : return "invalid device pointer";
-  case ncclInvalidRank            : return "invalid rank";
-  case ncclUnsupportedDeviceCount : return "unsupported device count";
-  case ncclDeviceNotFound         : return "device not found";
-  case ncclInvalidDeviceIndex     : return "invalid device index";
-  case ncclLibWrapperNotSet       : return "lib wrapper not initialized";
-  case ncclCudaMallocFailed       : return "cuda malloc failed";
-  case ncclRankMismatch           : return "parameter mismatch between ranks";
-  case ncclInvalidArgument        : return "invalid argument";
-  case ncclInvalidType            : return "invalid data type";
-  case ncclInvalidOperation       : return "invalid reduction operations";
-  }
-  return "unknown result code";
-}
-
-NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
-  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
-  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
-  *count = comm->nRanks;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
-  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
-  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
-  *devid = comm->cudaDev;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
-  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
-  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
-  *rank = comm->rank;
-  return ncclSuccess;
-}
-
diff --git a/third_party/nccl/src/core.h b/third_party/nccl/src/core.h
deleted file mode 100644
index 17794d7873..0000000000
--- a/third_party/nccl/src/core.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef CORE_H_
-#define CORE_H_
-
-
-#include "nccl.h"
-#include <cstdio>
-#include <cuda_runtime.h>
-
-#define MAXRANKS 32
-#define DEFAULT_BUFFER_SIZE_BYTES (1UL << 21)
-#define NCCL_MEM_PAD_ALIGN 65536
-
-
-struct ncclMem {
-  union { // Pad this block so that devBuff is correctly aligned
-    struct {
-      int   flags[2];
-      void* recvPtrs;
-      int   opCounter; // Used to determine when remote Communicators are ready.
-                       // Only used in host memory.
-    };
-    char pad[NCCL_MEM_PAD_ALIGN];
-  };
-  // devBuff will be bigger ; we only use its offset/address.
-  char buff[1];
-};
-
-template <typename T>
-struct alignas(long long) DevRing {
-  volatile int* __restrict__ prevOpCounter;
-  volatile int* __restrict__ nextOpCounter;
-  volatile int* __restrict__ sendFlagToNext;
-  volatile int* __restrict__ sendFlagToPrev;
-  volatile int* __restrict__ recvFlagFromNext;
-  volatile int* __restrict__ recvFlagFromPrev;
-
-  T* volatile * __restrict__ recvPtrFromNext;
-  T* volatile * __restrict__ sendPtrToPrev;
-  T*   __restrict__ recvBuffer;
-  T*   __restrict__ sendBuffer;
-
-  int userRank[MAXRANKS];
-};
-
-struct NodeRef {
-  ncclMem* remote; // TODO: Verify if these
-  ncclMem* local;  //       are still needed.
-  enum {DEVICE, HOST} type;
-  ncclMem* devCleanup;  // Used only when remote comm uses same process & GPU
-  ncclMem* hostCleanup; // Used whenever target is in different process
-  int* opCounter; // TODO: see if this can be removed too.
-};
-
-
-struct ncclComm {
-  int rank;    // my rank in the communicator
-  int nRanks;  // number of GPUs in communicator
-  int cudaDev; // my cuda device index
-
-  // Device and Host allocated chunks. Stored here to correctly free() memory.
-  ncclMem* devMem;
-  ncclMem* hostMem;
-  int hostMemState;
-  int opSched; // Scheduling operation index
-  int* opCounter; // Counter of completed operations
-
-  cudaStream_t prevStream; // cache last used stream
-  cudaEvent_t doneEvent; // orders operations in different streams
-
-  // Maps an internal nccl index to user-specified rank order. This is necessary
-  // since we need to know how the user expects data to be ordered across
-  // devices. Ordered from current device.
-  int* userFromRing;
-
-  // copy of the above stored on each device
-  int* devUserFromRing;
-
-  // Ring order
-  int* ncclFromRing; // TODO: REMOVE IF NOT NEEDED BEYOND CORE.CU
-
-  // Size of temp buffer in bytes.
-  size_t buffSize;
-
-  // Whether we have remote access to the recvbuff pointers passed from remote
-  // GPUs. In single process mode this can be used as long as QPI links are
-  // not present. In multi-process, we never push to a remote recvbuff.
-  int globalMemSpace;
-
-  // Device copy of the communicator
-  struct ncclComm *devComm;  // TODO: Remove this if not useful
-
-  // Device-side ring view
-  DevRing<char>* devRing;
-
-  // Device-to-device communication structures to access remote or local device
-  // memory. Actual allocation larger than 1.
-  NodeRef ptrs[1];
-};
-
-
-typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel;
-extern DebugLevel ncclDebugLevel;
-
-#define WARN(...) do {                                           \
-  if (ncclDebugLevel >= WARN) {                                  \
-    printf("WARN %s:%d ", __FILE__, __LINE__);                   \
-    printf(__VA_ARGS__);                                         \
-    printf("\n");                                                \
-    fflush(stdout);                                              \
-    if (ncclDebugLevel >= ABORT) abort();                        \
-  }                                                              \
-} while(0)
-
-#define INFO(...) do {                                           \
-  if (ncclDebugLevel >= INFO) {                                  \
-    printf("INFO "); printf(__VA_ARGS__); printf("\n");          \
-    fflush(stdout);                                              \
-  }                                                              \
-} while(0)
-
-// Check CUDA calls
-#define CUDACHECK(cmd, retcode) do {                        \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'\n", cudaGetErrorString(e)); \
-        return retcode;                                     \
-    }                                                       \
-} while(false)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess) { \
-    return res; \
-  } \
-} while (0);
-
-#ifdef PROFAPI
-#define NCCL_API(ret, func, args...)        \
-    __attribute__ ((visibility("default"))) \
-    __attribute__ ((alias(#func)))          \
-    ret p##func (args);                     \
-    extern "C"                              \
-    __attribute__ ((visibility("default"))) \
-    __attribute__ ((weak))                  \
-    ret func(args)
-#else
-#define NCCL_API(ret, func, args...)        \
-    extern "C"                              \
-    __attribute__ ((visibility("default"))) \
-    ret func(args)
-#endif // end PROFAPI
-
-
-#endif // end include guard
-
diff --git a/third_party/nccl/src/enqueue.h b/third_party/nccl/src/enqueue.h
deleted file mode 100644
index 43d570efee..0000000000
--- a/third_party/nccl/src/enqueue.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef enqueue_h_
-#define enqueue_h_
-
-#include "core.h"
-#include "reduce_kernel.h"
-
-/* Syncronize previous collective (if in different stream) and enqueue
- * collective. Work is performed asynchronously with the host thread.
- * The ColFunc class should be templated on the datatype and reduction
- * operator (if applicable) and define a static entry() method as
- * follows.
- *   template <typename T, template <typename> class RedOp>
- *   class CollectiveFunctor {
- *     public:
- *     static ncclResult_t entry(const void* sendbuff, void* recvbuff, int count,
- *         int root, ncclComm* comm, cudaStream_t stream);
- *   };
- * The entry() method can assume that the appropriate cuda device has been set. */
-template< template<typename, template<typename> class> class ColFunc,
-          typename T,
-          template<typename> class Op >
-ncclResult_t enqueue(const void* sendbuff,
-                     void* recvbuff,
-                     int count,
-                     int root,
-                     ncclComm_t comm,
-                     cudaStream_t stream)
-{
-  if (stream != comm->prevStream) { // sync required for calls in different streams
-    comm->prevStream = stream;
-    CUDACHECK(cudaStreamWaitEvent(stream, comm->doneEvent, 0), ncclUnhandledCudaError);
-  }
-
-  ncclResult_t ret;
-  ret = ColFunc<T, Op>::entry(sendbuff, recvbuff, count, root, comm, stream);
-
-  // Always have to record done event because we don't know what stream next
-  // collective will be in.
-  CUDACHECK(cudaEventRecord(comm->doneEvent, stream), ncclUnhandledCudaError);
-  comm->opSched += 1;
-  return ret;
-}
-
-
-// This version decodes type
-template< template<typename, template<typename> class> class ColFunc,
-          template<typename> class Op >
-ncclResult_t enqueue(const void* sendbuff,
-                     void* recvbuff,
-                     int count,
-                     ncclDataType_t type,
-                     int root,
-                     ncclComm_t comm,
-                     cudaStream_t stream)
-{
-  switch(type) {
-  case ncclChar:
-    return enqueue<ColFunc, char, Op>(sendbuff, recvbuff, count, root, comm, stream);
-  case ncclInt:
-    return enqueue<ColFunc, int, Op>(sendbuff, recvbuff, count, root, comm, stream);
-#ifdef CUDA_HAS_HALF
-  case ncclHalf:
-    return enqueue<ColFunc, half, Op>(sendbuff, recvbuff, count, root, comm, stream);
-#endif
-  case ncclFloat:
-    return enqueue<ColFunc, float, Op>(sendbuff, recvbuff, count, root, comm, stream);
-  case ncclDouble:
-    return enqueue<ColFunc, double, Op>(sendbuff, recvbuff, count, root, comm, stream);
-  case ncclInt64:
-    return enqueue<ColFunc, long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
-  case ncclUint64:
-    return enqueue<ColFunc, unsigned long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
-  default:
-    WARN("Invalid ncclType %d", type);
-    return ncclInvalidType;
-  }
-}
-
-// This version decodes both type and reduction op
-template< template<typename, template<typename> class> class ColFunc>
-ncclResult_t enqueue(const void* sendbuff,
-                     void* recvbuff,
-                     int count,
-                     ncclDataType_t type,
-                     ncclRedOp_t op,
-                     int root,
-                     ncclComm_t comm,
-                     cudaStream_t stream)
-{
-  switch(op) {
-  case ncclSum:
-    return enqueue<ColFunc, FuncSum>(sendbuff, recvbuff, count, type, root, comm, stream);
-  case ncclProd:
-    return enqueue<ColFunc, FuncProd>(sendbuff, recvbuff, count, type, root, comm, stream);
-  case ncclMax:
-    return enqueue<ColFunc, FuncMax>(sendbuff, recvbuff, count, type, root, comm, stream);
-  case ncclMin:
-    return enqueue<ColFunc, FuncMin>(sendbuff, recvbuff, count, type, root, comm, stream);
-  default:
-    WARN("Invalid ncclRedOp: %d", op);
-    return ncclInvalidOperation;
-  }
-}
-
-#endif // End include guard
-
diff --git a/third_party/nccl/src/libwrap.cu b/third_party/nccl/src/libwrap.cu
deleted file mode 100644
index 1ac19a6238..0000000000
--- a/third_party/nccl/src/libwrap.cu
+++ /dev/null
@@ -1,155 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "libwrap.h"
-#include <dlfcn.h>
-#include "core.h"
-
-int symbolsLoaded = 0;
-
-static nvmlReturn_t (*nvmlInternalInit)(void);
-static nvmlReturn_t (*nvmlInternalShutdown)(void);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
-static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
-
-ncclResult_t wrapSymbols(void) {
-
-  if (symbolsLoaded)
-    return ncclSuccess;
-
-  static void* nvmlhandle = NULL;
-  void* tmp;
-  void** cast;
-
-  nvmlhandle=dlopen("libnvidia-ml.so", RTLD_NOW);
-  if (!nvmlhandle) {
-    nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
-    if (!nvmlhandle) {
-      WARN("Failed to open libnvidia-ml.so[.1]");
-      goto teardown;
-    }
-  }
-
-  #define LOAD_SYM(handle, symbol, funcptr) do {         \
-    cast = (void**)&funcptr;                             \
-    tmp = dlsym(handle, symbol);                         \
-    if (tmp == NULL) {                                   \
-      WARN("dlsym failed on %s - %s", symbol, dlerror());\
-      goto teardown;                                     \
-    }                                                    \
-    *cast = tmp;                                         \
-  } while (0)
-
-  LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
-  LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
-
-  symbolsLoaded = 1;
-  return ncclSuccess;
-
-  teardown:
-  nvmlInternalInit = NULL;
-  nvmlInternalShutdown = NULL;
-  nvmlInternalDeviceGetHandleByPciBusId = NULL;
-  nvmlInternalDeviceGetIndex = NULL;
-  nvmlInternalDeviceSetCpuAffinity = NULL;
-  nvmlInternalDeviceClearCpuAffinity = NULL;
-
-  if (nvmlhandle != NULL) dlclose(nvmlhandle);
-  return ncclSystemError;
-}
-
-
-ncclResult_t wrapNvmlInit(void) {
-  if (nvmlInternalInit == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclLibWrapperNotSet;
-  }
-  nvmlReturn_t ret = nvmlInternalInit();
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlInit() failed: %s",
-      nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlShutdown(void) {
-  if (nvmlInternalShutdown == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclLibWrapperNotSet;
-  }
-  nvmlReturn_t ret = nvmlInternalShutdown();
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlShutdown() failed: %s ",
-      nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
-  if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclLibWrapperNotSet;
-  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
-      nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
-  if (nvmlInternalDeviceGetIndex == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclLibWrapperNotSet;
-  }
-  nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetIndex() failed: %s ",
-      nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalDeviceSetCpuAffinity == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclLibWrapperNotSet;
-  }
-  nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
-      nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalInit == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclLibWrapperNotSet;
-  }
-  nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
-      nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
diff --git a/third_party/nccl/src/libwrap.h b/third_party/nccl/src/libwrap.h
deleted file mode 100644
index cdce480415..0000000000
--- a/third_party/nccl/src/libwrap.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-// Dynamically handle dependencies on external libraries (other than cudart).
-
-#ifndef SRC_LIBWRAP_H_
-#define SRC_LIBWRAP_H_
-
-#include "core.h"
-
-/* Extracted from nvml.h */
-typedef struct nvmlDevice_st* nvmlDevice_t;
-
-typedef enum nvmlReturn_enum
-{
-    NVML_SUCCESS = 0,                   //!< The operation was successful
-    NVML_ERROR_UNINITIALIZED = 1,       //!< NVML was not first initialized with nvmlInit()
-    NVML_ERROR_INVALID_ARGUMENT = 2,    //!< A supplied argument is invalid
-    NVML_ERROR_NOT_SUPPORTED = 3,       //!< The requested operation is not available on target device
-    NVML_ERROR_NO_PERMISSION = 4,       //!< The current user does not have permission for operation
-    NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
-    NVML_ERROR_NOT_FOUND = 6,           //!< A query to find an object was unsuccessful
-    NVML_ERROR_INSUFFICIENT_SIZE = 7,   //!< An input argument is not large enough
-    NVML_ERROR_INSUFFICIENT_POWER = 8,  //!< A device's external power cables are not properly attached
-    NVML_ERROR_DRIVER_NOT_LOADED = 9,   //!< NVIDIA driver is not loaded
-    NVML_ERROR_TIMEOUT = 10,            //!< User provided timeout passed
-    NVML_ERROR_IRQ_ISSUE = 11,          //!< NVIDIA Kernel detected an interrupt issue with a GPU
-    NVML_ERROR_LIBRARY_NOT_FOUND = 12,  //!< NVML Shared Library couldn't be found or loaded
-    NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
-    NVML_ERROR_CORRUPTED_INFOROM = 14,  //!< infoROM is corrupted
-    NVML_ERROR_GPU_IS_LOST = 15,        //!< The GPU has fallen off the bus or has otherwise become inaccessible
-    NVML_ERROR_RESET_REQUIRED = 16,     //!< The GPU requires a reset before it can be used again
-    NVML_ERROR_OPERATING_SYSTEM = 17,   //!< The GPU control device has been blocked by the operating system/cgroups
-    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,   //!< RM detects a driver/library version mismatch
-    NVML_ERROR_IN_USE = 19,             //!< An operation cannot be performed because the GPU is currently in use
-    NVML_ERROR_UNKNOWN = 999            //!< An internal driver error occurred
-} nvmlReturn_t;
-/* End of nvml.h */
-
-ncclResult_t wrapSymbols(void);
-
-ncclResult_t wrapNvmlInit(void);
-ncclResult_t wrapNvmlShutdown(void);
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
-
-#endif // End include guard
-
diff --git a/third_party/nccl/src/nccl.h b/third_party/nccl/src/nccl.h
deleted file mode 100644
index 7bb5aa52bc..0000000000
--- a/third_party/nccl/src/nccl.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_H_
-#define NCCL_H_
-
-#include <cuda_runtime.h>
-
-#if CUDART_VERSION >= 7050
-#include <cuda_fp16.h>
-#define CUDA_HAS_HALF 1
-#else
-#undef CUDA_HAS_HALF
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Opaque handle to communicator */
-typedef struct ncclComm* ncclComm_t;
-
-#define NCCL_UNIQUE_ID_BYTES 128
-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
-
-/* Error type */
-typedef enum { ncclSuccess                 =  0,
-               ncclUnhandledCudaError      =  1,
-               ncclSystemError             =  2,
-               ncclInternalError           =  3,
-               ncclInvalidDevicePointer    =  4,
-               ncclInvalidRank             =  5,
-               ncclUnsupportedDeviceCount  =  6,
-               ncclDeviceNotFound          =  7,
-               ncclInvalidDeviceIndex      =  8,
-               ncclLibWrapperNotSet        =  9,
-               ncclCudaMallocFailed        = 10,
-               ncclRankMismatch            = 11,
-               ncclInvalidArgument         = 12,
-               ncclInvalidType             = 13,
-               ncclInvalidOperation        = 14,
-               nccl_NUM_RESULTS            = 15 } ncclResult_t;
-
-/* Generates a unique Id with each call. Used to generate commId for
- * ncclCommInitAll. uniqueId will be created in such a way that it is
- * guaranteed to be unique accross the host. */
-ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
-ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
-
-/* Creates a new communicator (multi process version).
- * rank must be between 0 and ndev-1 and unique within a communicator clique.
- * ndev is number of logical devices
- * The communicator is created on the current CUDA device.
- * ncclCommInitRank implicitly syncronizes with other ranks, so INIT OF EACH RANK MUST
- * BE CALLED IN A SEPARATE HOST THREADS to avoid deadlock. */
-ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
-ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
-
-/* Creates a clique of communicators.
- * This is a convenience function to create a single-process communicator clique.
- * Returns an array of ndev newly initialized communicators in comm.
- * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
- * If devlist is NULL, the first ndev CUDA devices are used.
- * Order of devlist defines user-order of processors within the communicator. */
-ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
-ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
-
-/* Frees resources associated with communicator object. */
-void  ncclCommDestroy(ncclComm_t comm);
-void pncclCommDestroy(ncclComm_t comm);
-
-/* Returns nice error message. */
-const char*  ncclGetErrorString(ncclResult_t result);
-const char* pncclGetErrorString(ncclResult_t result);
-
-/* Sets count to number of devices in the communicator clique. */
-ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
-ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
-
-/* Returns cuda device number associated with communicator. */
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
-ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
-
-/* Returns user-ordered "rank" assocaiated with communicator. */
-ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
-ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
-
-/* Reduction opperation selector */
-typedef enum { ncclSum        = 0,
-               ncclProd       = 1,
-               ncclMax        = 2,
-               ncclMin        = 3,
-               nccl_NUM_OPS   = 4 } ncclRedOp_t;
-
-/* Data types */
-typedef enum { ncclChar       = 0,
-               ncclInt        = 1,
-#ifdef CUDA_HAS_HALF
-               ncclHalf       = 2,
-#endif
-               ncclFloat      = 3,
-               ncclDouble     = 4,
-               ncclInt64      = 5,
-               ncclUint64     = 6,
-               nccl_NUM_TYPES = 7 } ncclDataType_t;
-
-/* Reduces data arrays of length count in sendbuff into recvbuf using op operation.
- * recvbuf may be NULL on all calls except for root device.
- * On the root device, sendbuff and recvbuff are assumed to reside on
- * the same device.
- * Must be called separately for each communicator in communicator clique.
-*/
-ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-
-/* Reduces data arrays of length count in sendbuff using op operation, and leaves
- * identical copies of result on each GPUs recvbuff.
- * Sendbuff and recvbuff are assumed to reside on the same device.
- * Must be called separately for each communicator in communicator clique. */
-ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
-
-/* Reduces data in sendbuff using op operation and leaves reduced result scattered
- * over the devices so that recvbuff on the i-th GPU will contain the i-th block of
- * the result. Sendbuff and recvbuff are assumed to reside on same device. Assumes
- * sendbuff has size at least ndev*recvcount elements, where ndev is number of
- * communicators in communicator clique
- * Must be called separately for each communicator in communicator clique.*/
-ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
-    int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    cudaStream_t stream);
-ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
-    int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    cudaStream_t stream);
-
-/* Copies count values from root to all other devices.
- * Root specifies the source device in user-order
- * (see ncclCommInit).
- * Must be called separately for each communicator in communicator clique. */
-ncclResult_t  ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-
-
-/* Each device gathers count values from other GPUs.
- * Result is ordered by comm's logical device order.
- * Assumes recvbuff has size at least ndev*count, where ndev is number of communicators
- * in communicator clique.
- * Sendbuff and recvbuff are assumed to reside on same device.
- * Must be called separately for each communicator in communicator clique. */
-ncclResult_t  ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
-    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
-    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-
-
-/* The following collective operations are not implemented yet */
-///* Gather count values from each device to recvbuff.
-// * Result is ordered by comm's logical device order.
-// * recvbuff may be NULL for all calls except for root device.
-// * On the root device, sendbuff and recvbuff are assumed to reside on the same device.
-// * Must be called separately for each communicator in communicator clique. */
-// * All GPUs, including root, perform copies into recvbuff.
-//ncclResult_t  ncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
-//    void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
-//ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
-//                        void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
-
-///* Root device scatters count values to each devices.
-// * sendbuff may be NULL on all devices except a single root
-// * device where it is assumed to have size at least nGPUs*count.
-// * recvbuff allocated on each gpu, including root, size=count.
-// * Result is ordered by comm's logical device order.
-// * Called separately for each device in the ncclComm. */
-//ncclResult_t  ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
-//    int count, int root, ncclComm_t comm, cudaStream_t stream);
-//ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
-//    int count, int root, ncclComm_t comm, cudaStream_t stream);
-//
-///* All GPUs scatter blocks of count elements to other devices.
-// * Must be called separately for each device in the ncclComm.
-// * sendbuff and recvbuff assumed to reside on same device and
-// * have size at least nGPUs*count.
-// * Called separately for each device in the ncclComm. */
-//ncclResult_t  ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
-//    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-//ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
-//    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-
-#ifdef __cplusplus
-} // end extern "C"
-#endif
-
-#endif // end include guard
-
diff --git a/third_party/nccl/src/primitives.h b/third_party/nccl/src/primitives.h
deleted file mode 100644
index bcaeca8f90..0000000000
--- a/third_party/nccl/src/primitives.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef PRIMITIVES_H_
-#define PRIMITIVES_H_
-
-#include <type_traits>
-#include "copy_kernel.h" // for FuncPassA
-#include "reduce_kernel.h" // for reduction funcs
-
-
-/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
- *
- * In order to reduce the reptetion of template arguments, the operations
- * are bundled as static methods of the Primitives class.
- *
- * Each primitive operation copies/reduces a contiguous buffer and syncs
- * an optional set of flags against a sub-step counter. The sync value is
- * based on the step parameter. Sync flags must be of type WaitFlag or
- * PostFlag. The primitive routines wait for all WaitFlag args to attain
- * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
- * corresponding substep by previous step) before executing the transfer.
- * After each substep is transfered, all PostFlag arguments get updated to
- * the value SUBSTEPS*step+substep+1.
- */
-
-
-class WaitFlag {
-  volatile int * const flag;
-  const int shift;
-  public:
-  __device__ __forceinline__
-  WaitFlag(volatile int * const flag, const int shift) : flag(flag), shift(shift) { }
-  __device__ __forceinline__
-  void wait(int val) { while (*flag < (val + shift)) /*SPIN*/; }
-};
-
-
-class PostFlag {
-  volatile int * const flag;
-  const int shift;
-  public:
-  __device__ __forceinline__
-  PostFlag(volatile int* const flag, const int shift) : flag(flag), shift(shift) { }
-  __device__ __forceinline__
-  void post(int val) { *flag = (val + shift); }
-};
-
-
-// Helper to check if any argument is of type T.
-// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template<typename T> __device__ __forceinline__
-bool AnyAre() { return false; }
-
-template<typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__ __forceinline__
-bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
-  return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
-}
-
-
-// Wait on all WaitFlags, ignore PostFlags
-__device__ __forceinline__
-void WaitOnFlags(int val) { }
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(int val, WaitFlag flag, TAIL_Ts... tail) {
-  flag.wait(val);
-  WaitOnFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(int val, PostFlag, TAIL_Ts... tail) {
-  WaitOnFlags(val, tail...);
-}
-
-
-// Post all PostFlags, ingnore WaitFlags
-__device__ __forceinline__
-void PostToFlags(int val) { }
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(int val, WaitFlag flag, TAIL_Ts... tail) {
-  PostToFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(int val, PostFlag flag, TAIL_Ts... tail) {
-  flag.post(val);
-  PostToFlags(val, tail...);
-}
-
-
-// Create pointer arithmetic syntax that doesn't break for nullptr_t
-template <typename Tptr> __device__ __forceinline__
-Tptr ptradd(Tptr ptr, int i) {
-  return ptr + i;
-}
-
-__device__ __forceinline__
-std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
-  return nullptr;
-}
-
-
-// Implementation of primitive types
-template <int THREADS, int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
-class Primitives {
-  private:
-  template <typename SRC2_T, // either T* or nullptr_t
-            typename DST2_T, // either T* or nullptr_t
-            typename... SYNC_Ts> // either WaitFunc or PostFunc
-  static __device__ __forceinline__ void
-  GenericOp(const T*     src1,
-            const SRC2_T src2,
-                  T*     dst1,
-                  DST2_T dst2,
-            int len, int maxoffset, int step, SYNC_Ts... flags) {
-
-    enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
-    enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
-    static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
-        "src2 must be of type T* or nullptr_t");
-    static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
-        "dst2 must be of type T* or nullptr_t");
-
-    using OpType = typename std::conditional<noSrc2, FuncPassA<T>, REDOP>::type;
-
-    if (threadIdx.x < THREADS) {
-      int sliceSize = len / SUBSTEPS;
-      int sliceOffset = 0;
-      #pragma unroll 1
-      for (int sub=0; sub<SUBSTEPS; ++sub) {
-        if (AnyAre<WaitFlag>(flags...)) {
-          if (threadIdx.x == 0) {
-            WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
-          }
-          asm volatile ("bar.sync 1, %0;" :: "r"(THREADS));
-        }
-        ReduceOrCopy
-            <
-             UNROLL,
-             THREADS,
-             OpType,
-             T,
-             !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
-             !std::is_same<SRC2_T, std::nullptr_t>::value  // HAS_SRC1
-            >
-            (
-             threadIdx.x,
-             ptradd(dst1, sliceOffset),
-             ptradd(dst2, sliceOffset),
-             ptradd(src1, sliceOffset),
-             ptradd(src2, sliceOffset),
-             min(sliceSize, maxoffset-sliceOffset)
-            );
-        if (AnyAre<PostFlag>(flags...)) {
-          __syncthreads();
-        }
-        sliceOffset += sliceSize;
-      }
-    } else {
-      for(int sub=0; sub<SUBSTEPS; ++sub) {
-        if (AnyAre<PostFlag>(flags...)) {
-          __syncthreads();
-          __threadfence_system();
-          PostToFlags(SUBSTEPS*step + sub + 1, flags...);
-        }
-      }
-    }
-  }
-
-  public:
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  Copy(const T* src, T* dst,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
-  }
-
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  DoubleCopy(const T* src, T* dst1, T* dst2,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
-  }
-
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  Reduce(const T* src1, const T* src2, T* dst,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src1, src2, dst, nullptr, len, maxOffset, step, flags...);
-  }
-
-  template <typename... SYNC_Ts>
-  static __device__ __forceinline__ void
-  ReduceCopy(const T* src1, const T* src2, T* dst1, T* dst2,
-      int len, int maxOffset, int step, SYNC_Ts... flags) {
-    GenericOp(src1, src2, dst1, dst2, len, maxOffset, step, flags...);
-  }
-};
-
-#endif // end include guard
diff --git a/third_party/nccl/src/reduce.cu b/third_party/nccl/src/reduce.cu
deleted file mode 100644
index 721518373f..0000000000
--- a/third_party/nccl/src/reduce.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  boffset += sliceSize; \
-  if (boffset == buffSize) boffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void ReduceKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ DevRing<T> ring;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, 0);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
-
-  const int size = args.N;
-  const int nranks = args.nRanks;
-  const int rank = ring.userRank[0];
-  const int prevRank = ring.userRank[nranks-1];
-  const int root = args.root;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int boffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int offset = 0; offset < size; offset += sliceSize) {
-    int maxOffset = size-offset;
-    if (prevRank == root) {
-      Prims::Copy(
-          thisInput + offset,
-          nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
-    } else if (rank == root) {
-      Prims::Reduce(
-          thisInput + offset,
-          prevInput + boffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      Prims::Reduce(
-          thisInput + offset,
-          prevInput + boffset,
-          nextOutput + boffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-    }
-    NEXT_STEP; // Increases step, boffset
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    if (rank != root) {
-      // Wait for last update from next then reset the flag
-      waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-      *ring.recvFlagFromNext = 0;
-    }
-
-    if (prevRank != root) {
-      // reset the flag
-      *ring.recvFlagFromPrev = 0;
-    }
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingReduce(const void* sendbuff, void* recvbuff, const int count, const int root,
-    ncclComm* comm, cudaStream_t stream) {
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
-  } else {
-    KernelArgs<T> args;
-    ArgsSetup(&args, sendbuff, recvbuff, root, count, comm);
-    LAUNCH_KERNEL(ReduceKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class ReduceFunctor {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int root, ncclComm* comm, cudaStream_t stream) {
-    return RingReduce<RedOp<T>, T>(sendbuff, recvbuff, count, root, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, int count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, root, comm, "Reduce"));
-  return enqueue<ReduceFunctor>(sendbuff, recvbuff, count, datatype, op, root, comm, stream);
-}
-
diff --git a/third_party/nccl/src/reduce_kernel.h b/third_party/nccl/src/reduce_kernel.h
deleted file mode 100644
index f2cd512f5c..0000000000
--- a/third_party/nccl/src/reduce_kernel.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef REDUCE_KERNEL_H_
-#define REDUCE_KERNEL_H_
-
-#include "common_kernel.h"
-#include <limits>
-
-template<typename T>
-struct FuncNull {
-  __device__ T operator()(const T x, const T y) const {
-    return 0;
-  }
-};
-
-template<typename T>
-struct FuncSum {
-  __device__ T operator()(const T x, const T y) const {
-    return x + y;
-  }
-};
-
-template<typename T>
-struct FuncProd {
-  __device__ T operator()(const T x, const T y) const {
-    return x * y;
-  }
-};
-
-template<typename T>
-struct FuncMax {
-  __device__ T operator()(const T x, const T y) const {
-    return (x < y) ? y : x;
-  }
-};
-
-template<typename T>
-struct FuncMin {
-  __device__ T operator()(const T x, const T y) const {
-    return (x < y) ? x : y;
-  }
-};
-
-template<>
-struct FuncSum<char> {
-  union converter {
-    uint32_t storage;
-    char4 a;
-  };
-  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
-    int32_t rv, z=0;
-    asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
-    return rv;
-#elif (__CUDA_ARCH__ >= 500)
-    int32_t rv;
-    asm("vadd.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-        "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-        "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-        "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
-#else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = cx.a.x + cy.a.x;
-    cr.a.y = cx.a.y + cy.a.y;
-    cr.a.z = cx.a.z + cy.a.z;
-    cr.a.w = cx.a.w + cy.a.w;
-    return cr.storage;
-#endif
-  }
-  __device__ char operator()(const char x, const char y) const {
-    return x+y;
-  }
-};
-
-template<>
-struct FuncProd<char> {
-  union converter { uint32_t storage; char4 a; };
-  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300)
-    int32_t rv, zero=0;
-    asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
-        " vmad.u32.u32.u32 t3, %1.b3, %2.b3, %3;\n\t"
-        " vmad.u32.u32.u32 t2, %1.b2, %2.b2, %3;\n\t"
-        " shl.b32          t3, t3, 16;\n\t"
-        " shl.b32          t2, t2, 16;\n\t"
-        " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
-        " shl.b32          t1, t1, 8;\n\t"
-        " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
-        " and.b32          t1, t1, 0xff00ff00;\n\t"
-        " and.b32          t0, t0, 0x00ff00ff;\n\t"
-        " or.b32           %0,  t0, t1;\n\t"
-        "}" : "=r"(rv) : "r"(x), "r"(y), "r"(zero));
-    return rv;
-#else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = cx.a.x * cy.a.x;
-    cr.a.y = cx.a.y * cy.a.y;
-    cr.a.z = cx.a.z * cy.a.z;
-    cr.a.w = cx.a.w * cy.a.w;
-    return cr.storage;
-#endif
-  }
-  __device__ char operator()(const char x, const char y) const {
-    return x*y;
-  }
-};
-
-template<>
-struct FuncMax<char> {
-  union converter { uint32_t storage; char4 a; };
-  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
-    int32_t rv, z=0;
-    if (std::numeric_limits<char>::is_signed)
-      asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
-    else
-      asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
-    return rv;
-#elif (__CUDA_ARCH__ >= 500)
-    int32_t rv;
-    if (std::numeric_limits<char>::is_signed)
-      asm("vmax.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-          "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-          "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-          "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    else
-      asm("vmax.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-          "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-          "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-          "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
-#else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = max(cx.a.x, cy.a.x);
-    cr.a.y = max(cx.a.y, cy.a.y);
-    cr.a.z = max(cx.a.z, cy.a.z);
-    cr.a.w = max(cx.a.w, cy.a.w);
-    return cr.storage;
-#endif
-  }
-  __device__ char operator()(const char x, const char y) const {
-    return (x>y) ? x : y;
-  }
-};
-
-template<>
-struct FuncMin<char> {
-  union converter { uint32_t storage; char4 a; };
-  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
-    int32_t rv, z=0;
-    if (std::numeric_limits<char>::is_signed)
-      asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
-    else
-      asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
-    return rv;
-#elif (__CUDA_ARCH__ >= 500)
-    int32_t rv;
-    if (std::numeric_limits<char>::is_signed)
-      asm("vmin.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
-          "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-          "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-          "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    else
-      asm("vmin.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
-          "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
-          "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
-          "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
-    return rv;
-#else
-    converter cx, cy, cr;
-    cx.storage = x;
-    cy.storage = y;
-    cr.a.x = min(cx.a.x, cy.a.x);
-    cr.a.y = min(cx.a.y, cy.a.y);
-    cr.a.z = min(cx.a.z, cy.a.z);
-    cr.a.w = min(cx.a.w, cy.a.w);
-    return cr.storage;
-#endif
-  }
-  __device__ char operator()(const char x, const char y) const {
-    return (x<y) ? x : y;
-  }
-};
-
-#ifdef CUDA_HAS_HALF
-template<>
-struct FuncSum<half> {
-  __device__ half2 operator()(const half2 x, const half2 y) const {
-#if __CUDA_ARCH__ >= 530
-    return __hadd2(x, y);
-#else
-    float2 fx, fy, fr;
-    fx = __half22float2(x);
-    fy = __half22float2(y);
-    fr.x = fx.x + fy.x;
-    fr.y = fx.y + fy.y;
-    return __float22half2_rn(fr);
-#endif
-  }
-  __device__ half operator()(const half x, const half y) const {
-#if __CUDA_ARCH__ >= 530
-    return __hadd(x, y);
-#else
-    return __float2half( __half2float(x) + __half2float(y) );
-#endif
-  }
-};
-
-template<>
-struct FuncProd<half> {
-  __device__ half2 operator()(const half2 x, const half2 y) const {
-#if __CUDA_ARCH__ >= 530
-    return __hmul2(x, y);
-#else
-    float2 fx, fy, fr;
-    fx = __half22float2(x);
-    fy = __half22float2(y);
-    fr.x = fx.x * fy.x;
-    fr.y = fx.y * fy.y;
-    return __float22half2_rn(fr);
-#endif
-  }
-  __device__ half operator()(const half x, const half y) const {
-#if __CUDA_ARCH__ >= 530
-    return __hmul(x, y);
-#else
-    return __float2half( __half2float(x) * __half2float(y) );
-#endif
-  }
-};
-
-template<>
-struct FuncMax<half> {
-  __device__ half2 operator()(const half2 x, const half2 y) const {
-    float2 fx, fy, fr;
-    fx = __half22float2(x);
-    fy = __half22float2(y);
-    fr.x = fmaxf(fx.x, fy.x);
-    fr.y = fmaxf(fx.y, fy.y);
-    return __float22half2_rn(fr);
-  }
-  __device__ half operator()(const half x, const half y) const {
-    float fx, fy, fm;
-    fx = __half2float(x);
-    fy = __half2float(y);
-    fm = fmaxf(fx, fy);
-    return __float2half(fm);
-  }
-};
-
-template<>
-struct FuncMin<half> {
-  __device__ half2 operator()(const half2 x, const half2 y) const {
-    float2 fx, fy, fr;
-    fx = __half22float2(x);
-    fy = __half22float2(y);
-    fr.x = fminf(fx.x, fy.x);
-    fr.y = fminf(fx.y, fy.y);
-    return __float22half2_rn(fr);
-  }
-  __device__ half operator()(const half x, const half y) const {
-    float fx, fy, fm;
-    fx = __half2float(x);
-    fy = __half2float(y);
-    fm = fminf(fx, fy);
-    return __float2half(fm);
-  }
-};
-#endif
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of threads in the CTA
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, class FUNC, typename T>
-__device__ void Reduce(volatile T * __restrict__ const dest,
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1, const int N) {
-  ReduceOrCopy<UNROLL, THREADS, FUNC, T, false, true>(threadIdx.x, dest,
-      nullptr, src0, src1, N);
-}
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of threads in the CTA
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, class FUNC, typename T>
-__device__ void ReduceAndCopy(volatile T * __restrict__ const dest0,
-    volatile T * __restrict__ const dest1,
-    const volatile T * __restrict__ const src0,
-    const volatile T * __restrict__ const src1, const int N) {
-  ReduceOrCopy<UNROLL, THREADS, FUNC, T, true, true>(threadIdx.x, dest0, dest1,
-      src0, src1, N);
-}
-
-#endif // REDUCE_KERNEL_H_
diff --git a/third_party/nccl/src/reduce_scatter.cu b/third_party/nccl/src/reduce_scatter.cu
deleted file mode 100644
index b1100dd0c0..0000000000
--- a/third_party/nccl/src/reduce_scatter.cu
+++ /dev/null
@@ -1,165 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void ReduceScatterKernel(const KernelArgs<T> args) {
-  const int tid = threadIdx.x;
-  __shared__ DevRing<T> ring;
-
-  LoadRing<THREADS>(args.ring, &ring);
-  __syncthreads();
-
-  if (tid == 0) {
-    WaitFlag prevCommOp(ring.prevOpCounter, 0);
-    WaitFlag nextCommOp(ring.nextOpCounter, 0);
-    prevCommOp.wait(args.opIndex);
-    nextCommOp.wait(args.opIndex);
-  }
-  __syncthreads();
-
-  WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
-  PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
-  PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
-  typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
-
-  const int size = args.N;
-  const int nranks = args.nRanks;
-  const int buffSize = args.buffSize / sizeof(T);
-  const int sliceSize = buffSize / NUM_BUFCHUNKS;
-  
-  int step = 0;
-  int poffset, noffset = 0;
-
-  // Compute pointers
-  const T * __restrict__ thisInput = args.ThisInput;
-  T * __restrict__ thisOutput =  args.ThisOutput;
-  T * __restrict__ prevInput = ring.recvBuffer;
-  T * __restrict__ nextOutput =  ring.sendBuffer;
-
-  for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) {
-    /////////////// begin ReduceScatter steps ///////////////
-    int offset;
-    int maxOffset = size-chunkOffset;
-    int rankDest;
-
-    // step 0: push data to next GPU
-    rankDest = ring.userRank[nranks-1];
-    offset = chunkOffset + rankDest * size;
-
-    Prims::Copy(
-        thisInput  + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP; // Increases step, poffset, noffset
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      rankDest = ring.userRank[nranks-j];
-      offset = chunkOffset + rankDest * size;
-
-      Prims::Reduce(
-          prevInput  + poffset,
-          thisInput  + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
-    }
-
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
-    rankDest = ring.userRank[0];
-    offset = chunkOffset + rankDest * size;
-
-    Prims::Reduce(
-        prevInput  + poffset,
-        thisInput  + offset,
-        thisOutput + chunkOffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP;
-  }
-
-  // wait for the last data to be pushed to us
-  if (tid == 0) {
-    // Wait for last update from next then reset the flag
-    waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
-    *ring.recvFlagFromNext = 0;
-
-    // Wait for last update from prev then reset the flag
-    waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
-    *ring.recvFlagFromPrev = 0;
-
-    incrementOpCounter(&args);
-  }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingReduceScatter(const void* sendbuff, void* recvbuff,
-    const int count, ncclComm* comm, cudaStream_t stream) {
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
-  } else {
-    KernelArgs<T> args;
-    ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
-    LAUNCH_KERNEL(ReduceScatterKernel, THREADS, UNROLL, FUNC, T, args, stream);
-  }
-
-  return ncclSuccess;
-}
-
-template<typename T, template <typename> class RedOp>
-class ReduceScatter {
-  public:
-  static ncclResult_t entry(const void* sendbuff, void* recvbuff,
-      int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
-    return RingReduceScatter<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
-  }
-};
-
-NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, int recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, int recvcount,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  NCCLCHECK(ArgsCheck(sendbuff, recvbuff, recvcount, datatype, op, 0, comm, "ReduceScatter"));
-  return enqueue<ReduceScatter>(sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream);
-}
-
diff --git a/third_party/nccl/test/include/test_utilities.h b/third_party/nccl/test/include/test_utilities.h
deleted file mode 100644
index c194205b4c..0000000000
--- a/third_party/nccl/test/include/test_utilities.h
+++ /dev/null
@@ -1,438 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-
-#ifndef SRC_TEST_UTILITIES_H_
-#define SRC_TEST_UTILITIES_H_
-
-#include <curand.h>
-#include <cerrno>
-#include <string>
-
-#define CUDACHECK(cmd) do {                         \
-  cudaError_t e = cmd;                              \
-  if( e != cudaSuccess ) {                          \
-    printf("Cuda failure %s:%d '%s'\n",             \
-        __FILE__,__LINE__,cudaGetErrorString(e));   \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define NCCLCHECK(cmd) do {                         \
-  ncclResult_t r = cmd;                             \
-  if (r!= ncclSuccess) {                            \
-    printf("NCCL failure %s:%d '%s'\n",             \
-        __FILE__,__LINE__,ncclGetErrorString(r));   \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-template<typename T>
-void Randomize(T* const dest, const int N, const int randomSeed);
-
-template<typename T>
-void Accumulate(T* dest, const T* contrib, int N, ncclRedOp_t op);
-
-template<typename T>
-double CheckDelta(const T* results, const T* expected, int N);
-
-#define CURAND_CHK(cmd)                                                         \
-    do {                                                                        \
-      curandStatus_t error = (cmd);                                             \
-      if (error != CURAND_STATUS_SUCCESS) {                                     \
-        printf("CuRAND error %i at %s:%i\n", error, __FILE__ , __LINE__);       \
-        exit(EXIT_FAILURE);                                                     \
-      }                                                                         \
-    } while (false)
-
-
-template<typename T>
-void GenerateRandom(curandGenerator_t generator, T * const dest,
-    const int N);
-
-template<>
-void GenerateRandom<char>(curandGenerator_t generator, char * const dest,
-    const int N) {
-  CURAND_CHK(curandGenerate(generator, (unsigned int*)dest,
-      N * sizeof(char) / sizeof(int)));
-}
-
-template<>
-void GenerateRandom<int>(curandGenerator_t generator, int * const dest,
-    const int N) {
-  CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N));
-}
-
-template<>
-void GenerateRandom<float>(curandGenerator_t generator, float * const dest,
-    const int N) {
-  CURAND_CHK(curandGenerateUniform(generator, dest, N));
-}
-
-template<>
-void GenerateRandom<double>(curandGenerator_t generator, double * const dest,
-    const int N) {
-  CURAND_CHK(curandGenerateUniformDouble(generator, dest, N));
-}
-
-template<>
-void GenerateRandom<unsigned long long>(curandGenerator_t generator, unsigned long long * const dest,
-    const int N) {
-  CURAND_CHK(curandGenerateLongLong(generator, dest, N));
-}
-
-
-template<typename T>
-void Randomize(T* const dest, const int N, const int randomSeed) {
-  curandGenerator_t gen;
-  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
-  CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
-  GenerateRandom<T>(gen, dest, N);
-  CURAND_CHK(curandDestroyGenerator(gen));
-  CUDACHECK(cudaDeviceSynchronize());
-}
-
-template<>
-void Randomize(unsigned long long* const dest, const int N, const int randomSeed) {
-  curandGenerator_t gen;
-  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
-  GenerateRandom<unsigned long long>(gen, dest, N);
-  CURAND_CHK(curandDestroyGenerator(gen));
-  CUDACHECK(cudaDeviceSynchronize());
-}
-
-template<>
-void Randomize(long long* const dest, const int N, const int randomSeed) {
-  curandGenerator_t gen;
-  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
-  GenerateRandom<unsigned long long>(gen, (unsigned long long *)dest, N);
-  CURAND_CHK(curandDestroyGenerator(gen));
-  CUDACHECK(cudaDeviceSynchronize());
-}
-
-#ifdef CUDA_HAS_HALF
-__global__ void halve(const float * src, half* dest, int N) {
-  for(int tid = threadIdx.x + blockIdx.x*blockDim.x;
-      tid < N; tid += blockDim.x * gridDim.x)
-    dest[tid] = __float2half(src[tid]);
-}
-
-template<>
-void Randomize<half>(half* const dest, const int N, const int randomSeed) {
-  curandGenerator_t gen;
-  CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
-  CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
-
-  float* temp;
-  CUDACHECK(cudaMalloc(&temp, N*sizeof(float)));
-  GenerateRandom<float>(gen, temp, N);
-  halve<<<128, 512>>>(temp, dest, N);
-  CURAND_CHK(curandDestroyGenerator(gen));
-  CUDACHECK(cudaFree(temp));
-  CUDACHECK(cudaDeviceSynchronize());
-}
-#endif
-
-void makeRandom(void* ptr, int n, ncclDataType_t type, int seed) {
-  if (type == ncclChar)
-    Randomize<char>((char*)ptr, n, seed);
-  else if (type == ncclInt)
-    Randomize<int>((int*)ptr, n, seed);
-#ifdef CUDA_HAS_HALF
-  else if (type == ncclHalf)
-    Randomize<half>((half*)ptr, n, seed);
-#endif
-  else if (type == ncclFloat)
-    Randomize<float>((float*)ptr, n, seed);
-  else if (type == ncclDouble)
-    Randomize<double>((double*)ptr, n, seed);
-  else if (type == ncclInt64)
-    Randomize<long long>((long long*)ptr, n, seed);
-  else if (type == ncclUint64)
-    Randomize<unsigned long long>((unsigned long long*)ptr, n, seed);
-
-  return;
-}
-
-template<typename T, int OP> __global__ static
-void accumKern(T* acum, const T* contrib, int N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    T c = contrib[i];
-    T a = acum[i];
-    if(OP == ncclSum) {
-      acum[i] = a+c;
-    } else if(OP == ncclProd) {
-      acum[i] = a*c;
-    } else if(OP == ncclMax) {
-      acum[i] = (a > c) ? a : c;
-    } else if(OP == ncclMin) {
-      acum[i] = (a < c) ? a : c;
-    }
-  }
-}
-
-#ifdef CUDA_HAS_HALF
-template<> __global__
-void accumKern<half, ncclSum>(half* acum, const half* contrib, int N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( a + c );
-  }
-}
-
-template<> __global__
-void accumKern<half, ncclProd>(half* acum, const half* contrib, int N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( a * c );
-  }
-}
-
-template<> __global__
-void accumKern<half, ncclMax>(half* acum, const half* contrib, int N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( (a>c) ? a : c );
-  }
-}
-
-template<> __global__
-void accumKern<half, ncclMin>(half* acum, const half* contrib, int N) {
-  int tid = threadIdx.x + blockIdx.x*blockDim.x;
-  int offset = blockDim.x*gridDim.x;
-  for(int i=tid; i<N; i+=offset) {
-    float c = __half2float(contrib[i]);
-    float a = __half2float(acum[i]);
-    acum[i] = __float2half( (a<c) ? a : c );
-  }
-}
-#endif
-
-template<typename T>
-void accVecType(void* out, void* in, int n, ncclRedOp_t op) {
-  switch(op) {
-    case ncclSum:  accumKern<T, ncclSum> <<<256,256>>>((T*)out, (T*)in, n); break;
-    case ncclProd: accumKern<T, ncclProd><<<256,256>>>((T*)out, (T*)in, n); break;
-    case ncclMax:  accumKern<T, ncclMax> <<<256,256>>>((T*)out, (T*)in, n); break;
-    case ncclMin:  accumKern<T, ncclMin> <<<256,256>>>((T*)out, (T*)in, n); break;
-    default:
-      printf("Unknown reduction operation.\n");
-      exit(EXIT_FAILURE);
-  }
-}
-
-template<typename T>
-void Accumulate(T* dest, const T* contrib, int N, ncclRedOp_t op) {
-
-  T* devdest;
-  CUDACHECK(cudaHostRegister(dest, N*sizeof(T), 0));
-  CUDACHECK(cudaHostGetDevicePointer(&devdest, dest, 0));
-  accVecType<T>((void*)devdest, (void*)contrib, N, op);
-  CUDACHECK(cudaHostUnregister(dest));
-}
-
-void accVec(void* out, void* in, int n, ncclDataType_t type, ncclRedOp_t op) {
-  switch (type) {
-    case ncclChar:   accVecType<char>               (out, in, n, op); break;
-    case ncclInt:    accVecType<int>                (out, in, n, op); break;
-#ifdef CUDA_HAS_HALF
-    case ncclHalf:   accVecType<half>               (out, in, n, op); break;
-#endif
-    case ncclFloat:  accVecType<float>              (out, in, n, op); break;
-    case ncclDouble: accVecType<double>             (out, in, n, op); break;
-    case ncclInt64:  accVecType<long long>          (out, in, n, op); break;
-    case ncclUint64: accVecType<unsigned long long> (out, in, n, op); break;
-    default:
-      printf("Unknown reduction type.\n");
-      exit(EXIT_FAILURE);
-  }
-}
-
-template<typename T> __device__
-double absDiff(T a, T b) {
-  return fabs((double)(b - a));
-}
-
-#ifdef CUDA_HAS_HALF
-template<> __device__
-double absDiff<half>(half a, half b) {
-  float x = __half2float(a);
-  float y = __half2float(b);
-  return fabs((double)(y-x));
-}
-#endif
-
-template<typename T, int BSIZE> __global__
-void deltaKern(const T* A, const T* B, int N, double* max) {
-  __shared__ double temp[BSIZE];
-  int tid = threadIdx.x;
-  double locmax = 0.0;
-  for(int i=tid; i<N; i+=blockDim.x) {
-
-    double delta = absDiff(A[i], B[i]);
-    if( delta > locmax )
-      locmax = delta;
-  }
-
-  temp[tid] = locmax;
-  for(int stride = BSIZE/2; stride > 1; stride>>=1) {
-    __syncthreads();
-    if( tid < stride )
-      temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
-  }
-  __syncthreads();
-  if( threadIdx.x == 0)
-    *max = temp[0] > temp[1] ? temp[0] : temp[1];
-}
-
-template<typename T>
-double CheckDelta(const T* results, const T* expected, int N) {
-  T* devexp;
-  double maxerr;
-  double* devmax;
-  CUDACHECK(cudaHostRegister((void*)expected, N*sizeof(T), 0));
-  CUDACHECK(cudaHostGetDevicePointer((void**)&devexp, (void*)expected, 0));
-  CUDACHECK(cudaHostRegister((void*)&maxerr, sizeof(double), 0));
-  CUDACHECK(cudaHostGetDevicePointer(&devmax, &maxerr, 0));
-  deltaKern<T, 512><<<1, 512>>>(results, devexp, N, devmax);
-  CUDACHECK(cudaHostUnregister(&maxerr));
-  CUDACHECK(cudaHostUnregister((void*)devexp));
-  return maxerr;
-}
-
-void maxDiff(double* max, void* first, void* second, int n, ncclDataType_t type, cudaStream_t s) {
-  switch (type) {
-    case ncclChar:   deltaKern<char, 512>              <<<1,512,0,s>>>((char*)first, (char*)second, n, max); break;
-    case ncclInt:    deltaKern<int, 512>               <<<1,512,0,s>>>((int*)first, (int*)second, n, max); break;
-#ifdef CUDA_HAS_HALF
-    case ncclHalf:   deltaKern<half, 512>              <<<1,512,0,s>>>((half*)first, (half*)second, n, max); break;
-#endif
-    case ncclFloat:  deltaKern<float, 512>             <<<1,512,0,s>>>((float*)first, (float*)second, n, max); break;
-    case ncclDouble: deltaKern<double, 512>            <<<1,512,0,s>>>((double*)first, (double*)second, n, max); break;
-    case ncclInt64:  deltaKern<long long, 512>         <<<1,512,0,s>>>((long long*)first, (long long*)second, n, max); break;
-    case ncclUint64: deltaKern<unsigned long long, 512><<<1,512,0,s>>>((unsigned long long*)first, (unsigned long long*)second, n, max); break;
-    default:
-      printf("Unknown reduction type.\n");
-      exit(EXIT_FAILURE);
-  }
-}
-
-std::string TypeName(const ncclDataType_t type) {
-  switch (type) {
-    case ncclChar:   return "char";
-    case ncclInt:    return "int";
-#ifdef CUDA_HAS_HALF
-    case ncclHalf:   return "half";
-#endif
-    case ncclFloat:  return "float";
-    case ncclDouble: return "double";
-    case ncclInt64:  return "int64";
-    case ncclUint64: return "uint64";
-    default:         return "unknown";
-  }
-}
-
-std::string OperationName(const ncclRedOp_t op) {
-  switch (op) {
-    case ncclSum:  return "sum";
-    case ncclProd: return "prod";
-    case ncclMax:  return "max";
-    case ncclMin:  return "min";
-    default:       return "unknown";
-  }
-}
-
-ncclDataType_t strToType(const char* s) {
-  if (strcmp(s, "char") == 0)
-    return ncclChar;
-  if (strcmp(s, "int") == 0)
-    return ncclInt;
-#ifdef CUDA_HAS_HALF
-  if (strcmp(s, "half") == 0)
-    return ncclHalf;
-#endif
-  if (strcmp(s, "float") == 0)
-    return ncclFloat;
-  if (strcmp(s, "double") == 0)
-    return ncclDouble;
-  if (strcmp(s, "int64") == 0)
-    return ncclInt64;
-  if (strcmp(s, "uint64") == 0)
-    return ncclUint64;
-
-  return nccl_NUM_TYPES;
-}
-
-size_t wordSize(ncclDataType_t type) {
-  switch(type) {
-    case ncclChar:   return sizeof(char);
-    case ncclInt:    return sizeof(int);
-#ifdef CUDA_HAS_HALF
-    case ncclHalf:   return sizeof(short);
-#endif
-    case ncclFloat:  return sizeof(float);
-    case ncclDouble: return sizeof(double);
-    case ncclInt64:  return sizeof(long long);
-    case ncclUint64: return sizeof(unsigned long long);
-  }
-
-  return 0;
-}
-
-double deltaMaxValue(ncclDataType_t type, bool is_reduction) {
-  if (is_reduction) {
-    switch(type) {
-#ifdef CUDA_HAS_HALF
-      case ncclHalf:   return 5e-2;
-#endif
-      case ncclFloat:  return 1e-5;
-      case ncclDouble: return 1e-12;
-    }
-  }
-  return 1e-200;
-}
-
-ncclRedOp_t strToOp(const char* s) {
-  if (strcmp(s, "sum") == 0)
-    return ncclSum;
-  if (strcmp(s, "prod") == 0)
-    return ncclProd;
-  if (strcmp(s, "max") == 0)
-    return ncclMax;
-  if (strcmp(s, "min") == 0)
-    return ncclMin;
-
-  return nccl_NUM_OPS;
-}
-
-int strToPosInt(const char* s) {
-  errno = 0;
-  long temp = strtol(s, NULL, 10);
-  if (errno != 0 || temp > INT_MAX || temp < 0)
-    return 0;
-  return (int)temp;
-}
-
-int strToNonNeg(const char* s) {
-  errno = 0;
-  long temp = strtol(s, NULL, 10);
-  if (errno != 0 || temp > INT_MAX || temp < 0)
-    return -1;
-  return (int)temp;
-}
-
-#endif // SRC_TEST_UTILITIES_H_
diff --git a/third_party/nccl/test/mpi/mpi_test.cu b/third_party/nccl/test/mpi/mpi_test.cu
deleted file mode 100644
index fea6ae599c..0000000000
--- a/third_party/nccl/test/mpi/mpi_test.cu
+++ /dev/null
@@ -1,93 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <sys/types.h>
-#include <unistd.h>
-#include <stdio.h>
-
-#include "nccl.h"
-#include "mpi.h"
-#include "test_utilities.h"
-
-#define SIZE 128
-#define NITERS 1
-
-int main(int argc, char *argv[]) {
-  ncclUniqueId commId;
-  int size, rank;
-  ncclResult_t ret;
-
-  MPI_Init(&argc, &argv);
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
-  if (argc < size) {
-    if (rank == 0)
-      printf("Usage : %s <GPU list per rank>\n", argv[0]);
-    exit(1);
-  }
-
-  int gpu = atoi(argv[rank+1]);
-
-  // We have to set our device before NCCL init
-  CUDACHECK(cudaSetDevice(gpu));
-  MPI_Barrier(MPI_COMM_WORLD);
-
-  // NCCL Communicator creation
-  ncclComm_t comm;
-  NCCLCHECK(ncclGetUniqueId(&commId));
-  MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD);
-  ret = ncclCommInitRank(&comm, size, commId, rank);
-  if (ret != ncclSuccess) {
-    printf("NCCL Init failed (%d) '%s'\n", ret, ncclGetErrorString(ret));
-    exit(1);
-  }
-
-  // CUDA stream creation
-  cudaStream_t stream;
-  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-
-  // Initialize input values
-  int *dptr;
-  CUDACHECK(cudaMalloc(&dptr, SIZE*2*sizeof(int)));
-  int *val = (int*) malloc(SIZE*sizeof(int));
-  for (int v=0; v<SIZE; v++) {
-    val[v] = rank + 1;
-  }
-  CUDACHECK(cudaMemcpy(dptr, val, SIZE*sizeof(int), cudaMemcpyHostToDevice));
-
-  // Compute final value
-  int ref = size*(size+1)/2;
-
-  // Run allreduce
-  int errors = 0;
-  for (int i=0; i<NITERS; i++) {
-    NCCLCHECK(ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream));
-  }
-
-  // Check results
-  CUDACHECK(cudaStreamSynchronize(stream));
-  CUDACHECK(cudaMemcpy(val, (dptr+SIZE), SIZE*sizeof(int), cudaMemcpyDeviceToHost));
-  for (int v=0; v<SIZE; v++) {
-    if (val[v] != ref) {
-      errors++;
-      printf("[%d] Error at %d : got %d instead of %d\n", rank, v, val[v], ref);
-    }
-  }
-  CUDACHECK(cudaFree(dptr));
-
-  MPI_Allreduce(MPI_IN_PLACE, &errors, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD);
-  if (rank == 0) {
-    if (errors)
-      printf("%d errors. Test FAILED.\n", errors);
-    else
-      printf("Test PASSED.\n");
-  }
-
-  MPI_Finalize();
-  ncclCommDestroy(comm);
-  return errors ? 1 : 0;
-}
diff --git a/third_party/nccl/test/single/all_gather_scan.cu b/third_party/nccl/test/single/all_gather_scan.cu
deleted file mode 100644
index becf315b57..0000000000
--- a/third_party/nccl/test/single/all_gather_scan.cu
+++ /dev/null
@@ -1,239 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
-  printf("\n"
-         "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
-         "Where:\n"
-#ifdef CUDA_HAS_HALF
-         "    type   =   [char|int|half|float|double|int64|uint64]\n"
-#else
-         "    type   =   [char|int|float|double|int64|uint64]\n"
-#endif
-         "    n_min  >   0\n"
-         "    n_max  >=  n_min\n"
-         "    delta  >   0\n\n", bin);
-  return;
-}
-
-int main(int argc, char* argv[]) {
-  int nvis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nvis));
-  if (nvis == 0) {
-    printf("No GPUs found\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  ncclDataType_t type;
-  int n_min;
-  int n_max;
-  int delta;
-  int gpus;
-  int* list = NULL;
-
-  if (argc < 4) {
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  type = strToType(argv[1]);
-  if (type == nccl_NUM_TYPES) {
-    printf("Invalid <type> '%s'\n", argv[1]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_min = strToPosInt(argv[2]);
-  if (n_min < 1) {
-    printf("Invalid <n_min> '%s'\n", argv[2]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_max = strToPosInt(argv[3]);
-  if (n_max < n_min) {
-    printf("Invalid <n_max> '%s'\n", argv[3]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  if (argc > 4) {
-    delta = strToPosInt(argv[4]);
-    if (delta < 1) {
-      printf("Invalid <delta> '%s'\n", argv[4]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
-  }
-
-  if (argc > 5) {
-    gpus = strToPosInt(argv[5]);
-    if (gpus < 1) {
-      printf("Invalid <gpus> '%s'\n", argv[5]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    gpus = nvis;
-  }
-
-  list = (int*)malloc(gpus*sizeof(int));
-
-  if (argc > 6 && argc != 6+gpus) {
-    printf("If given, GPU list must be fully specified.\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    if(argc > 6) {
-      list[g] = strToNonNeg(argv[6+g]);
-      if (list[g] < 0) {
-        printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      } else if (list[g] >= nvis) {
-        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      }
-    } else {
-      list[g] = g % nvis;
-    }
-  }
-
-  size_t word = wordSize(type);
-  size_t max_input = n_max * word;
-  size_t max_output = max_input * gpus;
-  void* refout;
-  CUDACHECK(cudaMallocHost(&refout, max_output));
-
-  void **input, **output;
-  double** localError;
-  ncclComm_t* comm;
-  cudaStream_t* stream;
-
-  input = (void**)malloc(gpus*sizeof(void*));
-  output = (void**)malloc(gpus*sizeof(void*));
-  localError = (double**)malloc(gpus*sizeof(double*));
-  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
-  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
-  for(int g=0; g<gpus; ++g) {
-    char busid[32] = {0};
-    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
-    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaMalloc(&input[g],  max_input));
-    CUDACHECK(cudaMalloc(&output[g], max_output));
-    CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
-    CUDACHECK(cudaStreamCreate(&stream[g]));
-    makeRandom(input[g], n_max, type, 42+g);
-
-    CUDACHECK(cudaMemcpy((char*)refout+max_input*g, input[g], max_input, cudaMemcpyDeviceToHost));
-  }
-
-  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
-  printf("       BYTES ERROR       MSEC     BW\n");
-
-  for(int n=n_min; n<=n_max; n+=delta) {
-    size_t out_bytes = word * n * gpus;
-
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaMemsetAsync(output[g], 0, out_bytes, stream[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-
-    auto start = std::chrono::high_resolution_clock::now();
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      NCCLCHECK(ncclAllGather(input[g], n, type, output[g], comm[g], stream[g]));
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-    auto stop = std::chrono::high_resolution_clock::now();
-    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
-        (stop - start).count() * 1000.0;
-
-    double max_error = 0.0;
-    for(int slice=0; slice<gpus; ++slice) {
-      void* refSlice = (void*)((char*)refout + slice*max_input);
-      for(int g=0; g<gpus; ++g) {
-        CUDACHECK(cudaSetDevice(list[g]));
-        void* mySlice = (void*)((char*)output[g] + slice*n*word);
-        maxDiff(localError[g], mySlice, refSlice, n, type, stream[g]);
-      }
-      for(int g=0; g<gpus; ++g) {
-        CUDACHECK(cudaSetDevice(list[g]));
-        CUDACHECK(cudaStreamSynchronize(stream[g]));
-        max_error = max(max_error, *localError[g]);
-      }
-    }
-
-    double mb = (double)(n*word * (gpus-1)) * 1.e-6;
-    double algbw = mb / ms;
-    printf("%12lu %5.0le %10.3lf %6.2lf\n",
-        n*word, max_error, ms, algbw);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaStreamDestroy(stream[g]));
-    ncclCommDestroy(comm[g]);
-    CUDACHECK(cudaFree(input[g]));
-    CUDACHECK(cudaFree(output[g]));
-    CUDACHECK(cudaFreeHost(localError[g]));
-  }
-
-  free(localError);
-  free(output);
-  free(input);
-  free(comm);
-  free(stream);
-  CUDACHECK(cudaFreeHost(refout));
-  exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/all_gather_test.cu b/third_party/nccl/test/single/all_gather_test.cu
deleted file mode 100644
index 40d2f31fb2..0000000000
--- a/third_party/nccl/test/single/all_gather_test.cu
+++ /dev/null
@@ -1,235 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = false;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
-    ncclComm_t* const comms, const std::vector<int>& dList) {
-  // initialize data
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-  T* buffer = (T*)malloc(nDev * N * sizeof(T));
-  T* result = (T*)malloc(nDev * N * sizeof(T));
-  memset(buffer, 0, nDev * N * sizeof(T));
-  memset(result, 0, nDev * N * sizeof(T));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamCreate(s+i));
-    CUDACHECK(cudaMemset(recvbuff[i], 0, nDev * N * sizeof(T)));
-    Randomize(sendbuff[i], N, i);
-
-    CUDACHECK(cudaMemcpy(result + i * N, sendbuff[i], N * sizeof(T),
-        cudaMemcpyDeviceToHost));
-  }
-
-  // warm up GPU
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    NCCLCHECK(ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
-        (void*)recvbuff[i], comms[i], s[i]));
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamSynchronize(s[i]));
-  }
-
-  //for (int n = 1; n <= N; n = n << 1)
-  {
-    int n = N;
-    printf("%12i  %12i  %6s", (int)(n * sizeof(T)), n, TypeName(type).c_str());
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      NCCLCHECK(ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
-          s[i]));
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count();
-    double algbw = (double)(n * sizeof(T)) / 1.0E9 * (double)(nDev - 1)
-        / elapsedSec;
-    double busbw = algbw;
-
-    double maxDelta = 0.0;
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      double tmpDelta = CheckDelta<T>(recvbuff[i], result, nDev*N);
-      maxDelta = std::max(tmpDelta, maxDelta);
-    }
-
-    printf("  %7.3f  %5.2f  %5.2f  %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
-        maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamDestroy(s[i]));
-  }
-  free(s);
-  free(buffer);
-  free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
-    const std::vector<int>& dList) {
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  T** sendbuff = (T**)malloc(nDev * sizeof(T*));
-  T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T)));
-    CUDACHECK(cudaMalloc(recvbuff + i, nDev * N * sizeof(T)));
-  }
-
-  RunTest<T>(sendbuff, recvbuff, N, type, comms, dList);
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaFree(sendbuff[i]));
-    CUDACHECK(cudaFree(recvbuff[i]));
-  }
-
-  free(sendbuff);
-  free(recvbuff);
-}
-
-void usage() {
-  printf("Tests nccl AllGather with user supplied arguments.\n"
-      "    Usage: all_reduce_test <data size in bytes> [number of GPUs] "
-      "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
-  int nVis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nVis));
-
-  int N = 0;
-  if (argc > 1) {
-    int t = sscanf(argv[1], "%d", &N);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    printf("Error: must specify at least data size in bytes!\n\n");
-    usage();
-    exit(EXIT_FAILURE);
-  }
-
-  int nDev = nVis;
-  if (argc > 2) {
-    int t = sscanf(argv[2], "%d", &nDev);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  }
-  std::vector<int> dList(nDev);
-  for (int i = 0; i < nDev; ++i)
-    dList[i] = i % nVis;
-
-
-  if (argc > 3) {
-    if (argc - 3 != nDev) {
-      printf("Error: insufficient number of GPUs in list\n\n");
-      usage();
-      exit(EXIT_FAILURE);
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      int t = sscanf(argv[3 + i], "%d", dList.data() + i);
-      if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[2 + i]);
-        usage();
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
-  printf("# Using devices\n");
-  for (int g=0; g<nDev; ++g) {
-    int cudaDev;
-    int rank;
-    cudaDeviceProp prop;
-    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
-    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
-    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
-        prop.pciBusID, prop.name);
-  }
-  printf("\n");
-
-  printf("# %10s  %12s  %6s  %7s  %5s  %5s  %7s\n",
-      "bytes", "N", "type", "time", "algbw", "busbw", "delta");
-
-  RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
-  RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
-  RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
-  RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
-  RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
-  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
-  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
-  printf("\n");
-
-  for(int i=0; i<nDev; ++i)
-    ncclCommDestroy(comms[i]);
-  free(comms);
-
-  char* str = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = str ? atof(str) : -1;
-  avg_bw /= avg_count;
-
-  printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
-  printf(" Avg bus bandwidth    : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
-  printf("\n");
-  if (errors || avg_bw < check_avg_bw)
-    exit(EXIT_FAILURE);
-  else 
-    exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/all_reduce_scan.cu b/third_party/nccl/test/single/all_reduce_scan.cu
deleted file mode 100644
index f93a09986e..0000000000
--- a/third_party/nccl/test/single/all_reduce_scan.cu
+++ /dev/null
@@ -1,247 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
-  printf("\n"
-         "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
-         "Where:\n"
-#ifdef CUDA_HAS_HALF
-         "    type   =   [char|int|half|float|double|int64|uint64]\n"
-#else
-         "    type   =   [char|int|float|double|int64|uint64]\n"
-#endif
-         "    op     =   [sum|prod|max|min]\n"
-         "    n_min  >   0\n"
-         "    n_max  >=  n_min\n"
-         "    delta  >   0\n\n", bin);
-  return;
-}
-
-int main(int argc, char* argv[]) {
-  int nvis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nvis));
-  if (nvis == 0) {
-    printf("No GPUs found\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  ncclDataType_t type;
-  ncclRedOp_t op;
-  int n_min;
-  int n_max;
-  int delta;
-  int gpus;
-  int* list = NULL;
-
-  if (argc < 5) {
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  type = strToType(argv[1]);
-  if (type == nccl_NUM_TYPES) {
-    printf("Invalid <type> '%s'\n", argv[1]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  op = strToOp(argv[2]);
-  if (op == nccl_NUM_OPS) {
-    printf("Invalid <op> '%s'\n", argv[2]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_min = strToPosInt(argv[3]);
-  if (n_min < 1) {
-    printf("Invalid <n_min> '%s'\n", argv[3]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_max = strToPosInt(argv[4]);
-  if (n_max < n_min) {
-    printf("Invalid <n_max> '%s'\n", argv[4]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  if (argc > 5) {
-    delta = strToPosInt(argv[5]);
-    if (delta < 1) {
-      printf("Invalid <delta> '%s'\n", argv[5]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
-  }
-
-  if (argc > 6) {
-    gpus = strToPosInt(argv[6]);
-    if (gpus < 1) {
-      printf("Invalid <gpus> '%s'\n", argv[6]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    gpus = nvis;
-  }
-
-  list = (int*)malloc(gpus*sizeof(int));
-
-  if (argc > 7 && argc != 7+gpus) {
-    printf("If given, GPU list must be fully specified.\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    if(argc > 7) {
-      list[g] = strToNonNeg(argv[7+g]);
-      if (list[g] < 0) {
-        printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      } else if (list[g] >= nvis) {
-        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      }
-    } else {
-      list[g] = g % nvis;
-    }
-  }
-
-  size_t word = wordSize(type);
-  size_t max_size = n_max * word;
-  void* refout;
-  CUDACHECK(cudaMallocHost(&refout, max_size));
-
-  void **input, **output;
-  double** localError;
-  ncclComm_t* comm;
-  cudaStream_t* stream;
-
-  input = (void**)malloc(gpus*sizeof(void*));
-  output = (void**)malloc(gpus*sizeof(void*));
-  localError = (double**)malloc(gpus*sizeof(double*));
-  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
-  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
-  for(int g=0; g<gpus; ++g) {
-    char busid[32] = {0};
-    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
-    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaMalloc(&input[g],  max_size));
-    CUDACHECK(cudaMalloc(&output[g], max_size));
-    CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
-    CUDACHECK(cudaStreamCreate(&stream[g]));
-    makeRandom(input[g], n_max, type, 42+g);
-
-    if (g == 0)
-      CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
-    else
-      accVec(refout, input[g], n_max, type, op);
-  }
-
-  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
-  printf("       BYTES ERROR       MSEC  ALGBW  BUSBW\n");
-
-  for(int n=n_min; n<=n_max; n+=delta) {
-    size_t bytes = word * n;
-
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-
-    auto start = std::chrono::high_resolution_clock::now();
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      NCCLCHECK(ncclAllReduce(input[g], output[g], n, type, op, comm[g], stream[g]));
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-    auto stop = std::chrono::high_resolution_clock::now();
-    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
-        (stop - start).count() * 1000.0;
-
-    double max_error = 0.0;
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      maxDiff(localError[g], output[g], refout, n, type, stream[g]);
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-      max_error = max(max_error, *localError[g]);
-    }
-
-    double mb = (double)bytes * 1.e-6;
-    double algbw = mb / ms;
-    double busbw = algbw * (double)(2*gpus - 2) / (double)gpus;
-    printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
-        n*word, max_error, ms, algbw, busbw);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaStreamDestroy(stream[g]));
-    ncclCommDestroy(comm[g]);
-    CUDACHECK(cudaFree(input[g]));
-    CUDACHECK(cudaFree(output[g]));
-    CUDACHECK(cudaFreeHost(localError[g]));
-  }
-
-  free(localError);
-  free(output);
-  free(input);
-  free(comm);
-  free(stream);
-  CUDACHECK(cudaFreeHost(refout));
-  exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/all_reduce_test.cu b/third_party/nccl/test/single/all_reduce_test.cu
deleted file mode 100644
index 1935a38fa7..0000000000
--- a/third_party/nccl/test/single/all_reduce_test.cu
+++ /dev/null
@@ -1,301 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-int csv = false;
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = true;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
-    const ncclRedOp_t op, ncclComm_t* comms, const std::vector<int>& dList) {
-  // initialize data
-  T* buffer = (T*)malloc(N * sizeof(T));
-  T* result = (T*)malloc(N * sizeof(T));
-  memset(buffer, 0, N * sizeof(T));
-  memset(result, 0, N * sizeof(T));
-
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamCreate(s+i));
-    CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
-    Randomize(sendbuff[i], N, i);
-    if(i == 0) {
-      CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost));
-    } else {
-      Accumulate<T>(result, sendbuff[i], N, op);
-    }
-  }
-
-  // warm up GPU
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamSynchronize(s[i]));
-  }
-
-//  for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
-  {
-    int n = N;
-    printf((csv) ? "%i,%i,%s,%s," : "%12i  %12i  %6s  %6s",
-        (int) (n * sizeof(T)), n, TypeName(type).c_str(),
-        OperationName(op).c_str());
-
-    // do out-of-place reduction first
-    nvtxRangePushA("out of place");
-    auto start = std::chrono::high_resolution_clock::now();
-    //for (int i=0; i<100; i++) {
-      for (int i = 0; i < nDev; ++i) {
-        CUDACHECK(cudaSetDevice(dList[i]));
-        NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-            comms[i], s[i]));
-      }
-    //}
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-    nvtxRangePop();
-
-    nvtxRangePushA("out of place bookkeeping");
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count(); // / 100.0;
-    double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
-    double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev;
-
-    double maxDelta = 0.0;
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      double tmpDelta = CheckDelta<T>(recvbuff[i], result, N);
-      maxDelta = std::max(tmpDelta, maxDelta);
-    }
-
-    printf((csv)?"%f,%f,%f,%le,":"  %7.3f  %5.2f  %5.2f  %7.0le",
-        elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-
-    nvtxRangePop();
-  }
-
-
-//  for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
-  {
-    int n = N;
-    // now do in-place reduction
-    nvtxRangePushA("in place");
-    auto start = std::chrono::high_resolution_clock::now();
-    //for (int i=0; i<100; i++) {
-      for (int i = 0; i < nDev; ++i) {
-        CUDACHECK(cudaSetDevice(dList[i]));
-        NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-            comms[i], s[i]));
-      }
-    //}
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-    nvtxRangePop();
-
-    nvtxRangePushA("in place bookkeeping");
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count(); // / 100.0;
-    double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
-    double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev;
-
-    double maxDelta = 0.0;
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      double tmpDelta = CheckDelta<T>(sendbuff[i], result, N);
-      maxDelta = std::max(tmpDelta, maxDelta);
-    }
-
-    printf((csv)?"%f,%f,%f,%le,":"  %7.3f  %5.2f  %5.2f  %7.0le\n",
-        elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-
-    nvtxRangePop();
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamDestroy(s[i]));
-  }
-  free(s);
-  free(buffer);
-  free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms,
-    const std::vector<int>& dList) {
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  T** sendbuff = (T**)malloc(nDev * sizeof(T*));
-  T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T)));
-    CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
-  }
-
-  for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
-//  for (ncclRedOp_t op : { ncclSum }) {
-    RunTest<T>(sendbuff, recvbuff, N, type, op, comms, dList);
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaFree(sendbuff[i]));
-    CUDACHECK(cudaFree(recvbuff[i]));
-  }
-
-  free(sendbuff);
-  free(recvbuff);
-}
-
-void usage() {
-  printf("Tests nccl AllReduce with user supplied arguments.\n"
-      "    Usage: all_reduce_test <data size in bytes> [number of GPUs] "
-      "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
-  int nVis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nVis));
-
-  int N = 0;
-  if (argc > 1) {
-    int t = sscanf(argv[1], "%d", &N);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    printf("Error: must specify at least data size in bytes!\n\n");
-    usage();
-    exit(EXIT_FAILURE);
-  }
-
-  int nDev = nVis;
-  if (argc > 2) {
-    int t = sscanf(argv[2], "%d", &nDev);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  }
-  std::vector<int> dList(nDev);
-  for (int i = 0; i < nDev; ++i)
-    dList[i] = i % nVis;
-
-  if (argc > 3) {
-    if (argc - 3 != nDev) {
-      printf("Error: insufficient number of GPUs in list\n\n");
-      usage();
-      exit(EXIT_FAILURE);
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      int t = sscanf(argv[3 + i], "%d", dList.data() + i);
-      if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[2 + i]);
-        usage();
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
-  if (!csv) {
-    printf("# Using devices\n");
-    for (int g = 0; g < nDev; ++g) {
-      int cudaDev;
-      int rank;
-      cudaDeviceProp prop;
-      NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
-      NCCLCHECK(ncclCommUserRank(comms[g], &rank));
-      CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-      printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
-          prop.pciBusID, prop.name);
-    }
-    printf("\n");
-
-    printf("# %10s  %12s  %6s  %6s        out-of-place                    in-place\n", "", "", "", "");
-    printf("# %10s  %12s  %6s  %6s  %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n", "bytes", "N", "type", "op",
-               "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
-  }
-  else {
-    printf("B,N,type,op,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n");
-  }
-
-  RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
-  RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
-  RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
-  RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
-  RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
-  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
-  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
-  printf("\n");
-
-  for(int i=0; i<nDev; ++i)
-    ncclCommDestroy(comms[i]);
-  free(comms);
-
-  char* str = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = str ? atof(str) : -1;
-  avg_bw /= avg_count;
-
-  printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
-  printf(" Avg bus bandwidth    : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
-  printf("\n");
-  if (errors || avg_bw < check_avg_bw)
-    exit(EXIT_FAILURE);
-  else 
-    exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/broadcast_scan.cu b/third_party/nccl/test/single/broadcast_scan.cu
deleted file mode 100644
index ea11c7d96d..0000000000
--- a/third_party/nccl/test/single/broadcast_scan.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
-  printf("\n"
-         "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
-         "Where:\n"
-#ifdef CUDA_HAS_HALF
-         "    type   =   [char|int|half|float|double|int64|uint64]\n"
-#else
-         "    type   =   [char|int|float|double|int64|uint64]\n"
-#endif
-         "    n_min  >   0\n"
-         "    n_max  >=  n_min\n"
-         "    delta  >   0\n\n", bin);
-  return;
-}
-
-int main(int argc, char* argv[]) {
-  int nvis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nvis));
-  if (nvis == 0) {
-    printf("No GPUs found\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  ncclDataType_t type;
-  int n_min;
-  int n_max;
-  int delta;
-  int gpus;
-  int* list = NULL;
-
-  if (argc < 4) {
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  type = strToType(argv[1]);
-  if (type == nccl_NUM_TYPES) {
-    printf("Invalid <type> '%s'\n", argv[1]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_min = strToPosInt(argv[2]);
-  if (n_min < 1) {
-    printf("Invalid <n_min> '%s'\n", argv[2]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_max = strToPosInt(argv[3]);
-  if (n_max < n_min) {
-    printf("Invalid <n_max> '%s'\n", argv[3]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  if (argc > 4) {
-    delta = strToPosInt(argv[4]);
-    if (delta < 1) {
-      printf("Invalid <delta> '%s'\n", argv[4]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
-  }
-
-  if (argc > 5) {
-    gpus = strToPosInt(argv[5]);
-    if (gpus < 1) {
-      printf("Invalid <gpus> '%s'\n", argv[5]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    gpus = nvis;
-  }
-
-  list = (int*)malloc(gpus*sizeof(int));
-
-  if (argc > 6 && argc != 6+gpus) {
-    printf("If given, GPU list must be fully specified.\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    if(argc > 6) {
-      list[g] = strToNonNeg(argv[6+g]);
-      if (list[g] < 0) {
-        printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      } else if (list[g] >= nvis) {
-        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      }
-    } else {
-      list[g] = g % nvis;
-    }
-  }
-
-  size_t word = wordSize(type);
-  size_t max_size = n_max * word;
-  void* refout;
-  CUDACHECK(cudaMallocHost(&refout, max_size));
-
-  void** io;
-  double* localError;
-  ncclComm_t* comm;
-  cudaStream_t* stream;
-
-  io = (void**)malloc(gpus*sizeof(void*));
-  CUDACHECK(cudaMallocHost(&localError, gpus*sizeof(double)));
-  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
-  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
-  for(int g=0; g<gpus; ++g) {
-    char busid[32] = {0};
-    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
-    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaStreamCreate(&stream[g]));
-    CUDACHECK(cudaMalloc(&io[g], max_size));
-    if(g == 0) {
-      makeRandom(io[g], n_max, type, 42+g);
-      CUDACHECK(cudaMemcpy(refout, io[g], max_size, cudaMemcpyDeviceToHost));
-    }
-  }
-
-  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
-  printf("       BYTES ERROR       MSEC     BW\n");
-
-  for(int n=n_min; n<=n_max; n+=delta) {
-    size_t bytes = word * n;
-
-    for(int g=1; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaMemsetAsync(io[g], 0, bytes, stream[g]));
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[0]));
-    }
-
-    auto start = std::chrono::high_resolution_clock::now();
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      NCCLCHECK(ncclBcast(io[g], n, type, 0, comm[g], stream[g]));
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-    auto stop = std::chrono::high_resolution_clock::now();
-    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
-        (stop - start).count() * 1000.0;
-
-    for(int g=1; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      maxDiff(localError+g, io[g], refout, n, type, stream[g]);
-    }
-    double maxError = 0.0;
-    for(int g=1; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-      maxError = max(maxError, localError[g]);
-    }
-
-    double mb = (double)bytes * 1.e-6;
-    double algbw = mb / ms;
-    printf("%12lu %5.0le %10.3lf %6.2lf\n",
-        n*word, maxError, ms, algbw);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaStreamDestroy(stream[g]));
-    ncclCommDestroy(comm[g]);
-    CUDACHECK(cudaFree(io[g]));
-  }
-
-  free(io);
-  free(comm);
-  free(stream);
-  CUDACHECK(cudaFreeHost(refout));
-  CUDACHECK(cudaFreeHost(localError));
-  exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/broadcast_test.cu b/third_party/nccl/test/single/broadcast_test.cu
deleted file mode 100644
index 6b1e04fb9d..0000000000
--- a/third_party/nccl/test/single/broadcast_test.cu
+++ /dev/null
@@ -1,235 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = false;
-
-template<typename T>
-void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
-    ncclComm_t* const comms, const std::vector<int>& dList) {
-  // initialize data
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-  T* buffer = (T*)malloc(N * sizeof(T));
-  T* result = (T*)malloc(N * sizeof(T));
-  memset(result, 0, N * sizeof(T));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamCreate(s+i));
-
-    if (i == root) {
-      Randomize(buff[root], N, root);
-      CUDACHECK(cudaMemcpy(result, buff[root], N * sizeof(T),
-          cudaMemcpyDeviceToHost));
-    } else {
-      CUDACHECK(cudaMemset(buff[i], 0, N * sizeof(T)));
-    }
-
-    CUDACHECK(cudaDeviceSynchronize());
-  }
-
-  // warm up GPU
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    NCCLCHECK(ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]));
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamSynchronize(s[i]));
-  }
-
-//  for (int n = 1; n <= N; n = n << 1)
-  {
-    int n = N;
-    printf("%12i  %12i  %6s  %4i", (int)(n * sizeof(T)), n,
-        TypeName(type).c_str(), root);
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      NCCLCHECK(ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]));
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count();
-    double algbw = (double)(n * sizeof(T)) / 1.0E9  / elapsedSec;
-    double busbw = algbw;
-
-    double maxDelta = 0.0;
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      double tmpDelta = CheckDelta<T>(buff[i], result, n);
-      maxDelta = std::max(tmpDelta, maxDelta);
-    }
-
-    printf("  %7.3f  %5.2f  %5.2f  %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
-            maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-
-  }
-
-  for(int i=0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamDestroy(s[i]));
-  }
-  free(s);
-  free(buffer);
-  free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
-    const std::vector<int>& dList) {
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  T** buff = (T**)malloc(nDev * sizeof(T*));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaMalloc(buff + i, N * sizeof(T)));
-  }
-
-  //for (int root = 1; root < 2; ++root) {
-  for (int root = 0; root < nDev; ++root) {
-    RunTest<T>(buff, N, type, root, comms, dList);
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaFree(buff[i]));
-  }
-
-  free(buff);
-}
-
-void usage() {
-  printf("Tests nccl Broadcast with user supplied arguments.\n"
-      "    Usage: broadcast_test <data size in bytes> [number of GPUs] "
-      "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
-  int nVis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nVis));
-
-  unsigned long long N = 0;
-  if (argc > 1) {
-    int t = sscanf(argv[1], "%llu", &N);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    printf("Error: must specify at least data size in bytes!\n\n");
-    usage();
-    exit(EXIT_FAILURE);
-  }
-
-  int nDev = nVis;
-  if (argc > 2) {
-    int t = sscanf(argv[2], "%d", &nDev);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  }
-  std::vector<int> dList(nDev);
-  for (int i = 0; i < nDev; ++i)
-    dList[i] = i % nVis;
-
-  if (argc > 3) {
-    if (argc - 3 != nDev) {
-      printf("Error: insufficient number of GPUs in list\n\n");
-      usage();
-      exit(EXIT_FAILURE);
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      int t = sscanf(argv[3 + i], "%d", dList.data() + i);
-      if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[2 + i]);
-        usage();
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);;
-  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
-  printf("# Using devices\n");
-  for (int g = 0; g < nDev; ++g) {
-    int cudaDev;
-    int rank;
-    cudaDeviceProp prop;
-    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
-    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
-    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
-        prop.pciBusID, prop.name);
-  }
-  printf("\n");
-
-  printf("# %10s  %12s  %6s  %4s  %7s  %5s  %5s  %7s\n",
-      "bytes", "N", "type", "root", "time", "algbw", "busbw", "delta");
-
-  RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
-  RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
-  RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
-  RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
-  RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
-  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
-  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
-  printf("\n");
-
-  for(int i = 0; i < nDev; ++i)
-    ncclCommDestroy(comms[i]);
-  free(comms);
-
-  char* str = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = str ? atof(str) : -1;
-  avg_bw /= avg_count;
-
-  printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
-  printf(" Avg bus bandwidth    : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
-  printf("\n");
-  if (errors || avg_bw < check_avg_bw)
-    exit(EXIT_FAILURE);
-  else 
-    exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/reduce_scan.cu b/third_party/nccl/test/single/reduce_scan.cu
deleted file mode 100644
index f42643eb4d..0000000000
--- a/third_party/nccl/test/single/reduce_scan.cu
+++ /dev/null
@@ -1,238 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
-  printf("\n"
-         "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
-         "Where:\n"
-#ifdef CUDA_HAS_HALF
-         "    type   =   [char|int|half|float|double|int64|uint64]\n"
-#else
-         "    type   =   [char|int|float|double|int64|uint64]\n"
-#endif
-         "    op     =   [sum|prod|max|min]\n"
-         "    n_min  >   0\n"
-         "    n_max  >=  n_min\n"
-         "    delta  >   0\n\n", bin);
-  return;
-}
-
-int main(int argc, char* argv[]) {
-  int nvis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nvis));
-  if (nvis == 0) {
-    printf("No GPUs found\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  ncclDataType_t type;
-  ncclRedOp_t op;
-  int n_min;
-  int n_max;
-  int delta;
-  int gpus;
-  int* list = NULL;
-
-  if (argc < 5) {
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  type = strToType(argv[1]);
-  if (type == nccl_NUM_TYPES) {
-    printf("Invalid <type> '%s'\n", argv[1]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  op = strToOp(argv[2]);
-  if (op == nccl_NUM_OPS) {
-    printf("Invalid <op> '%s'\n", argv[2]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_min = strToPosInt(argv[3]);
-  if (n_min < 1) {
-    printf("Invalid <n_min> '%s'\n", argv[3]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_max = strToPosInt(argv[4]);
-  if (n_max < n_min) {
-    printf("Invalid <n_max> '%s'\n", argv[4]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  if (argc > 5) {
-    delta = strToPosInt(argv[5]);
-    if (delta < 1) {
-      printf("Invalid <delta> '%s'\n", argv[5]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
-  }
-
-  if (argc > 6) {
-    gpus = strToPosInt(argv[6]);
-    if (gpus < 1) {
-      printf("Invalid <gpus> '%s'\n", argv[6]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    gpus = nvis;
-  }
-
-  list = (int*)malloc(gpus*sizeof(int));
-
-  if (argc > 7 && argc != 7+gpus) {
-    printf("If given, GPU list must be fully specified.\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    if(argc > 7) {
-      list[g] = strToNonNeg(argv[7+g]);
-      if (list[g] < 0) {
-        printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      } else if (list[g] >= nvis) {
-        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      }
-    } else {
-      list[g] = g % nvis;
-    }
-  }
-
-  size_t word = wordSize(type);
-  size_t max_size = n_max * word;
-  void* refout;
-  CUDACHECK(cudaMallocHost(&refout, max_size));
-
-  void** input;
-  void* output; // always goes on rank 0
-  double* maxError;
-  ncclComm_t* comm;
-  cudaStream_t* stream;
-
-  input = (void**)malloc(gpus*sizeof(void*));
-  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
-  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
-  for(int g=0; g<gpus; ++g) {
-    char busid[32] = {0};
-    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
-    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaStreamCreate(&stream[g]));
-    CUDACHECK(cudaMalloc(&input[g],  max_size));
-    makeRandom(input[g], n_max, type, 42+g);
-
-    if (g == 0) {
-      CUDACHECK(cudaMalloc(&output, max_size));
-      CUDACHECK(cudaMallocHost(&maxError, sizeof(double)));
-      CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
-    } else {
-      accVec(refout, input[g], n_max, type, op);
-    }
-  }
-
-  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
-  printf("       BYTES ERROR       MSEC     BW\n");
-
-  for(int n=n_min; n<=n_max; n+=delta) {
-    size_t bytes = word * n;
-
-    CUDACHECK(cudaSetDevice(list[0]));
-    CUDACHECK(cudaMemsetAsync(output, 0, bytes, stream[0]));
-    for(int g=0; g<gpus; ++g)
-      CUDACHECK(cudaStreamSynchronize(stream[0]));
-
-    auto start = std::chrono::high_resolution_clock::now();
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      NCCLCHECK(ncclReduce(input[g], output, n, type, op, 0, comm[g], stream[g]));
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-    auto stop = std::chrono::high_resolution_clock::now();
-    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
-        (stop - start).count() * 1000.0;
-
-    CUDACHECK(cudaSetDevice(list[0]));
-    maxDiff(maxError, output, refout, n, type, stream[0]);
-    CUDACHECK(cudaStreamSynchronize(stream[0]));
-
-    double mb = (double)bytes * 1.e-6;
-    double algbw = mb / ms;
-    printf("%12lu %5.0le %10.3lf %6.2lf\n",
-        n*word, *maxError, ms, algbw);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaStreamDestroy(stream[g]));
-    ncclCommDestroy(comm[g]);
-    CUDACHECK(cudaFree(input[g]));
-    if(g == 0) {
-      CUDACHECK(cudaFree(output));
-      CUDACHECK(cudaFreeHost(maxError));
-    }
-  }
-
-  free(input);
-  free(comm);
-  free(stream);
-  CUDACHECK(cudaFreeHost(refout));
-  exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/reduce_scatter_scan.cu b/third_party/nccl/test/single/reduce_scatter_scan.cu
deleted file mode 100644
index 8c37508c4d..0000000000
--- a/third_party/nccl/test/single/reduce_scatter_scan.cu
+++ /dev/null
@@ -1,249 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
-  printf("\n"
-         "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
-         "Where:\n"
-#ifdef CUDA_HAS_HALF
-         "    type   =   [char|int|half|float|double|int64|uint64]\n"
-#else
-         "    type   =   [char|int|float|double|int64|uint64]\n"
-#endif
-         "    op     =   [sum|prod|max|min]\n"
-         "    n_min  >   0\n"
-         "    n_max  >=  n_min\n"
-         "    delta  >   0\n\n", bin);
-  return;
-}
-
-int main(int argc, char* argv[]) {
-  int nvis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nvis));
-  if (nvis == 0) {
-    printf("No GPUs found\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  ncclDataType_t type;
-  ncclRedOp_t op;
-  int n_min;
-  int n_max;
-  int delta;
-  int gpus;
-  int* list = NULL;
-
-  if (argc < 5) {
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  type = strToType(argv[1]);
-  if (type == nccl_NUM_TYPES) {
-    printf("Invalid <type> '%s'\n", argv[1]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  op = strToOp(argv[2]);
-  if (op == nccl_NUM_OPS) {
-    printf("Invalid <op> '%s'\n", argv[2]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_min = strToPosInt(argv[3]);
-  if (n_min < 1) {
-    printf("Invalid <n_min> '%s'\n", argv[3]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  n_max = strToPosInt(argv[4]);
-  if (n_max < n_min) {
-    printf("Invalid <n_max> '%s'\n", argv[4]);
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  if (argc > 5) {
-    delta = strToPosInt(argv[5]);
-    if (delta < 1) {
-      printf("Invalid <delta> '%s'\n", argv[5]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
-  }
-
-  if (argc > 6) {
-    gpus = strToPosInt(argv[6]);
-    if (gpus < 1) {
-      printf("Invalid <gpus> '%s'\n", argv[6]);
-      showUsage(argv[0]);
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    gpus = nvis;
-  }
-
-  list = (int*)malloc(gpus*sizeof(int));
-
-  if (argc > 7 && argc != 7+gpus) {
-    printf("If given, GPU list must be fully specified.\n");
-    showUsage(argv[0]);
-    exit(EXIT_FAILURE);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    if(argc > 7) {
-      list[g] = strToNonNeg(argv[7+g]);
-      if (list[g] < 0) {
-        printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      } else if (list[g] >= nvis) {
-        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
-        showUsage(argv[0]);
-        exit(EXIT_FAILURE);
-      }
-    } else {
-      list[g] = g % nvis;
-    }
-  }
-
-  size_t word = wordSize(type);
-  size_t max_output = n_max * word;
-  size_t max_input = gpus * max_output;
-  void* refout;
-  CUDACHECK(cudaMallocHost(&refout, max_input)); // contains entire reduction
-
-  void **input, **output;
-  double** localError;
-  ncclComm_t* comm;
-  cudaStream_t* stream;
-
-  input = (void**)malloc(gpus*sizeof(void*));
-  output = (void**)malloc(gpus*sizeof(void*));
-  localError = (double**)malloc(gpus*sizeof(double*));
-  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
-  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
-  for(int g=0; g<gpus; ++g) {
-    char busid[32] = {0};
-    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
-    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaMalloc(&input[g],  max_input));
-    CUDACHECK(cudaMalloc(&output[g], max_output));
-    CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
-    CUDACHECK(cudaStreamCreate(&stream[g]));
-    makeRandom(input[g], n_max*gpus, type, 42+g);
-
-    if (g == 0)
-      CUDACHECK(cudaMemcpy(refout, input[g], max_input, cudaMemcpyDeviceToHost));
-    else
-      accVec(refout, input[g], n_max*gpus, type, op);
-  }
-
-  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
-  printf("       BYTES ERROR       MSEC  ALGBW  BUSBW\n");
-
-  for(int n=n_min; n<=n_max; n+=delta) {
-    size_t bytes = word * n;
-
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-
-    auto start = std::chrono::high_resolution_clock::now();
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      NCCLCHECK(ncclReduceScatter(input[g], output[g], n, type, op, comm[g], stream[g]));
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-    }
-    auto stop = std::chrono::high_resolution_clock::now();
-    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
-        (stop - start).count() * 1000.0;
-
-    double max_error = 0.0;
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      void* myRef = (void*)((char*)refout + g*bytes);
-      maxDiff(localError[g], output[g], myRef, n, type, stream[g]);
-    }
-    for(int g=0; g<gpus; ++g) {
-      CUDACHECK(cudaSetDevice(list[g]));
-      CUDACHECK(cudaStreamSynchronize(stream[g]));
-      max_error = max(max_error, *localError[g]);
-    }
-
-    double mb = (double)bytes * 1.e-6;
-    double algbw = mb / ms;
-    double busbw = algbw * (double)(gpus - 1);
-    printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
-        n*word, max_error, ms, algbw, busbw);
-  }
-
-  for(int g=0; g<gpus; ++g) {
-    CUDACHECK(cudaSetDevice(list[g]));
-    CUDACHECK(cudaStreamDestroy(stream[g]));
-    ncclCommDestroy(comm[g]);
-    CUDACHECK(cudaFree(input[g]));
-    CUDACHECK(cudaFree(output[g]));
-    CUDACHECK(cudaFreeHost(localError[g]));
-  }
-
-  free(localError);
-  free(output);
-  free(input);
-  free(comm);
-  free(stream);
-  CUDACHECK(cudaFreeHost(refout));
-  exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/reduce_scatter_test.cu b/third_party/nccl/test/single/reduce_scatter_test.cu
deleted file mode 100644
index b702800925..0000000000
--- a/third_party/nccl/test/single/reduce_scatter_test.cu
+++ /dev/null
@@ -1,285 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = true;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
-    const ncclRedOp_t op, ncclComm_t* const comms, const std::vector<int>& dList) {
-  // initialize data
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
-  T* buffer = (T*)malloc(N * nDev * sizeof(T));
-  T* result = (T*)malloc(N * nDev * sizeof(T));
-  memset(buffer, 0, N * nDev * sizeof(T));
-  memset(result, 0, N * nDev * sizeof(T));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamCreate(s+i));
-    CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
-    Randomize(sendbuff[i], N * nDev, i);
-
-    if (i == 0) {
-      CUDACHECK(cudaMemcpy(result, sendbuff[i], N * nDev * sizeof(T),
-          cudaMemcpyDeviceToHost));
-    } else {
-      Accumulate<T>(result, sendbuff[i], N * nDev, op);
-    }
-  }
-
-  // warm up GPU
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
-        std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamSynchronize(s[i]));
-  }
-
-//  for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
-  {
-    int n = N;
-    printf("%12i  %12i  %6s  %6s", (int)(n * sizeof(T)), n,
-        TypeName(type).c_str(), OperationName(op).c_str());
-
-    // do out-of-place reduction first
-    auto start = std::chrono::high_resolution_clock::now();
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
-          op, comms[i], s[i]));
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count();
-    double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
-    double busbw = algbw * (double)(nDev - 1);
-
-    double maxDelta = 0.0;
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      double tmpDelta = CheckDelta<T>(recvbuff[i], result+i*n, n);
-      maxDelta = std::max(tmpDelta, maxDelta);
-    }
-
-    printf("  %7.3f  %5.2f  %5.2f  %7.0le", elapsedSec * 1.0E3, algbw, busbw,
-        maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-  }
-
-  {
-    // now do in-place reduction
-    int n = N;
-
-    auto start = std::chrono::high_resolution_clock::now();
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
-          op, comms[i], s[i]));
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count();
-    double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
-    double busbw = algbw * (double)(nDev - 1);
-
-    double maxDelta = 0.0;
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      double tmpDelta = CheckDelta<T>(sendbuff[i], result+i*n, n);
-      maxDelta = std::max(tmpDelta, maxDelta);
-    }
-
-    printf("  %7.3f  %5.2f  %5.2f  %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
-        maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamDestroy(s[i]));
-  }
-  free(s);
-  free(buffer);
-  free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
-    const std::vector<int>& dList) {
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  T** sendbuff = (T**)malloc(nDev * sizeof(T*));
-  T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaMalloc(sendbuff + i, N * nDev * sizeof(T)));
-    CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
-  }
-
-  for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
-//  for (ncclRedOp_t op : { ncclSum }) {
-    RunTest<T>(sendbuff, recvbuff, N, type, op, comms, dList);
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaFree(sendbuff[i]));
-    CUDACHECK(cudaFree(recvbuff[i]));
-  }
-
-  free(sendbuff);
-  free(recvbuff);
-}
-
-void usage() {
-  printf("Tests nccl ReduceScatter with user supplied arguments.\n"
-      "    Usage: all_reduce_test <data size in bytes> [number of GPUs] "
-      "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
-  int nVis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nVis));
-
-  int N = 0;
-  if (argc > 1) {
-    int t = sscanf(argv[1], "%d", &N);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    printf("Error: must specify at least data size in bytes!\n\n");
-    usage();
-    exit(EXIT_FAILURE);
-  }
-
-  int nDev = nVis;
-  if (argc > 2) {
-    int t = sscanf(argv[2], "%d", &nDev);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  }
-  std::vector<int> dList(nDev);
-  for (int i = 0; i < nDev; ++i)
-    dList[i] = i % nVis;
-
-  if (argc > 3) {
-    if (argc - 3 != nDev) {
-      printf("Error: insufficient number of GPUs in list\n\n");
-      usage();
-      exit(EXIT_FAILURE);
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      int t = sscanf(argv[3 + i], "%d", dList.data() + i);
-      if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[2 + i]);
-        usage();
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
-  printf("# Using devices\n");
-  for (int g = 0; g < nDev; ++g) {
-    int cudaDev;
-    int rank;
-    cudaDeviceProp prop;
-    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
-    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
-    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-    printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
-        prop.pciBusID, prop.name);
-  }
-  printf("\n");
-
-  printf("# %10s  %12s  %6s  %6s        out-of-place                      "
-      "in-place\n", "", "", "", "");
-  printf("# %10s  %12s  %6s  %6s  %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n",
-      "bytes", "N", "type", "op", "time", "algbw", "busbw", "delta", "time",
-      "algbw", "busbw", "delta");
-
-  RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
-  RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
-  RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
-  RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
-  RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
-  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
-  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
-  printf("\n");
-
-  for(int i=0; i<nDev; ++i)
-    ncclCommDestroy(comms[i]);
-  free(comms);
-
-  char* str = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = str ? atof(str) : -1;
-  avg_bw /= avg_count;
-
-  printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
-  printf(" Avg bus bandwidth    : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
-  printf("\n");
-  if (errors || avg_bw < check_avg_bw)
-    exit(EXIT_FAILURE);
-  else 
-    exit(EXIT_SUCCESS);
-}
-
diff --git a/third_party/nccl/test/single/reduce_test.cu b/third_party/nccl/test/single/reduce_test.cu
deleted file mode 100644
index 6abb49c45f..0000000000
--- a/third_party/nccl/test/single/reduce_test.cu
+++ /dev/null
@@ -1,299 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-int csv = false;
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = true;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
-    const ncclRedOp_t op, int root, ncclComm_t* const comms,
-    const std::vector<int>& dList) {
-
-  // initialize data
-  T* buffer = (T*)malloc(N * sizeof(T));
-  T* result = (T*)malloc(N * sizeof(T));
-  memset(buffer, 0, N * sizeof(T));
-  memset(result, 0, N * sizeof(T));
-
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamCreate(s+i));
-    CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
-    Randomize(sendbuff[i], N, i);
-    if(i == 0) {
-      CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost));
-    } else {
-      Accumulate<T>(result, sendbuff[i], N, op);
-    }
-  }
-
-  // warm up GPU
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
-        type, op, root, comms[i], s[i]));
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamSynchronize(s[i]));
-  }
-
-//  for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
-  {
-    int n = N;
-    printf((csv) ? "%i,%i,%s,%s,%d," : "%12i  %12i  %6s  %6s %4d",
-        (int) (n * sizeof(T)), n, TypeName(type).c_str(),
-        OperationName(op).c_str(), root);
-
-    // do out-of-place reduction first
-    nvtxRangePushA("out of place");
-    auto start = std::chrono::high_resolution_clock::now();
-    //for (int i=0; i<100; i++) {
-      for (int i = 0; i < nDev; ++i) {
-        CUDACHECK(cudaSetDevice(dList[i]));
-        NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-            root, comms[i], s[i]));
-      }
-    //}
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-    nvtxRangePop();
-
-    nvtxRangePushA("out of place bookkeeping");
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count(); // / 100.0;
-    double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
-    double busbw = algbw;
-
-    CUDACHECK(cudaSetDevice(dList[root]));
-    double maxDelta = CheckDelta<T>(recvbuff[root], result, N);
-
-    printf((csv)?"%f,%f,%f,%le,":"  %7.3f  %5.2f  %5.2f  %7.0le",
-        elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-
-    nvtxRangePop();
-  }
-
-
-//  for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
-  {
-    int n = N;
-    // now do in-place reduction
-    nvtxRangePushA("in place");
-    auto start = std::chrono::high_resolution_clock::now();
-    //for (int i=0; i<100; i++) {
-      for (int i = 0; i < nDev; ++i) {
-        CUDACHECK(cudaSetDevice(dList[i]));
-        NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-            root, comms[i], s[i]));
-      }
-    //}
-
-    for (int i = 0; i < nDev; ++i) {
-      CUDACHECK(cudaSetDevice(dList[i]));
-      CUDACHECK(cudaStreamSynchronize(s[i]));
-    }
-
-    auto stop = std::chrono::high_resolution_clock::now();
-    nvtxRangePop();
-
-    nvtxRangePushA("in place bookkeeping");
-    double elapsedSec =
-        std::chrono::duration_cast<std::chrono::duration<double>>(
-            stop - start).count(); // / 100.0;
-    double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
-    double busbw = algbw;
-
-    CUDACHECK(cudaSetDevice(dList[root]));
-    double maxDelta = CheckDelta<T>(sendbuff[root], result, N);
-
-    printf((csv)?"%f,%f,%f,%le,":"  %7.3f  %5.2f  %5.2f  %7.0le\n",
-        elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
-    if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
-    avg_bw += busbw;
-    avg_count++;
-
-    nvtxRangePop();
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaStreamDestroy(s[i]));
-  }
-  free(s);
-  free(buffer);
-  free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
-    const std::vector<int>& dList) {
-  int nDev = 0;
-  NCCLCHECK(ncclCommCount(comms[0], &nDev));
-  T** sendbuff = (T**)malloc(nDev * sizeof(T*));
-  T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T)));
-    CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
-  }
-
-  for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
-//  for (ncclRedOp_t op : { ncclSum }) {
-    for(int root=0; root<nDev; ++root) {
-      RunTest<T>(sendbuff, recvbuff, N, type, op, root, comms, dList);
-    }
-  }
-
-  for (int i = 0; i < nDev; ++i) {
-    CUDACHECK(cudaSetDevice(dList[i]));
-    CUDACHECK(cudaFree(sendbuff[i]));
-    CUDACHECK(cudaFree(recvbuff[i]));
-  }
-
-  free(sendbuff);
-  free(recvbuff);
-}
-
-void usage() {
-  printf("Tests nccl Reduce with user supplied arguments.\n"
-      "    Usage: reduce_test <data size in bytes> [number of GPUs] "
-      "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
-  int nVis = 0;
-  CUDACHECK(cudaGetDeviceCount(&nVis));
-
-  int N = 0;
-  if (argc > 1) {
-    int t = sscanf(argv[1], "%d", &N);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  } else {
-    printf("Error: must specify at least data size in bytes!\n\n");
-    usage();
-    exit(EXIT_FAILURE);
-  }
-
-  int nDev = nVis;
-  if (argc > 2) {
-    int t = sscanf(argv[2], "%d", &nDev);
-    if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[1]);
-      usage();
-      exit(EXIT_FAILURE);
-    }
-  }
-  std::vector<int> dList(nDev);
-  for (int i = 0; i < nDev; ++i)
-    dList[i] = i % nVis;
-
-  if (argc > 3) {
-    if (argc - 3 != nDev) {
-      printf("Error: insufficient number of GPUs in list\n\n");
-      usage();
-      exit(EXIT_FAILURE);
-    }
-
-    for (int i = 0; i < nDev; ++i) {
-      int t = sscanf(argv[3 + i], "%d", dList.data() + i);
-      if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[2 + i]);
-        usage();
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
-  if (!csv) {
-    printf("# Using devices\n");
-    for (int g = 0; g < nDev; ++g) {
-      int cudaDev;
-      int rank;
-      cudaDeviceProp prop;
-      NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
-      NCCLCHECK(ncclCommUserRank(comms[g], &rank));
-      CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
-      printf("#   Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
-        prop.pciBusID, prop.name);
-    }
-    printf("\n");
-
-    printf("# %10s  %12s  %6s  %6s  %4s        out-of-place                    in-place\n", "", "", "", "", "");
-    printf("# %10s  %12s  %6s  %6s  %4s %7s  %5s  %5s  %7s  %7s  %5s  %5s  %7s\n",
-               "bytes", "N", "type", "op", "root",
-               "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
-  }
-  else {
-    printf("B,N,type,op,root,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n");
-  }
-
-  RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
-  RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
-  RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
-  RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
-  RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
-  RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
-  RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
-  printf("\n");
-
-  for(int i = 0; i < nDev; ++i)
-    ncclCommDestroy(comms[i]);
-  free(comms);
-
-  char* str = getenv("NCCL_TESTS_MIN_BW");
-  double check_avg_bw = str ? atof(str) : -1;
-  avg_bw /= avg_count;
-
-  printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
-  printf(" Avg bus bandwidth    : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
-  printf("\n");
-  if (errors || avg_bw < check_avg_bw)
-    exit(EXIT_FAILURE);
-  else 
-    exit(EXIT_SUCCESS);
-}
-
author	Teng Li <tengli@fb.com>	2018-10-04 11:31:26 -0700
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2018-10-04 11:42:17 -0700
commit	ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade (patch)
tree	6bd56dd8fc590ce3baa40cdfceddd58cb2186218 /third_party
parent	6b79e16d6dbd5b3c71775c69a770a17cbe0b2f08 (diff)
download	pytorch-ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade.tar.gz pytorch-ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade.tar.bz2 pytorch-ae7a7fb398cb859732a5f6f0bfdfb87b803a2ade.zip