diff options
author | Cyprien Noel <cyprien.noel@gmail.com> | 2015-05-18 20:07:36 -0700 |
---|---|---|
committer | Evan Shelhamer <shelhamer@imaginarynumber.net> | 2015-08-09 15:13:11 -0700 |
commit | d2f045768cba7d494abb4d168fc366d6fce80b85 (patch) | |
tree | 032c5f952fb5b97868557f320a365f306733a0e0 /include | |
parent | bcc8f50a95ecad954d1887f3fb273eaa298e2274 (diff) | |
download | caffeonacl-d2f045768cba7d494abb4d168fc366d6fce80b85.tar.gz caffeonacl-d2f045768cba7d494abb4d168fc366d6fce80b85.tar.bz2 caffeonacl-d2f045768cba7d494abb4d168fc366d6fce80b85.zip |
Allocate host memory through cudaMallocHost
thanks to discussion by @thatguymike and @flx42
Diffstat (limited to 'include')
-rw-r--r-- | include/caffe/syncedmem.hpp | 31 |
1 file changed, 17 insertions, 14 deletions
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 4d339bf4..4a1a2f3f 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -8,26 +8,29 @@ namespace caffe { -// Theoretically, CaffeMallocHost and CaffeFreeHost should simply call the -// cudaMallocHost and cudaFree functions in order to create pinned memory. -// However, those codes rely on the existence of a cuda GPU (I don't know -// why that is a must since allocating memory should not be accessing the -// GPU resource, but it just creates an error as of Cuda 5.0) and will cause -// problem when running on a machine without GPU. Thus, we simply define -// these two functions for safety and possible future change if the problem -// of calling cuda functions disappears in a future version. -// -// In practice, although we are creating unpinned memory here, as long as we -// are constantly accessing them the memory pages almost always stays in -// the physical memory (assuming we have large enough memory installed), and -// does not seem to create a memory bottleneck here. - +// If CUDA is available and in GPU mode, host memory will be allocated pinned, +// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA). +// The improvement in performance seems negligible in the single GPU case, +// but might be more significant for parallel training. Most importantly, +// it improved stability for large models on many GPUs. inline void CaffeMallocHost(void** ptr, size_t size) { +#ifndef CPU_ONLY + if (Caffe::mode() == Caffe::GPU) { + CUDA_CHECK(cudaMallocHost(ptr, size)); + return; + } +#endif *ptr = malloc(size); CHECK(*ptr) << "host allocation of size " << size << " failed"; } inline void CaffeFreeHost(void* ptr) { +#ifndef CPU_ONLY + if (Caffe::mode() == Caffe::GPU) { + CUDA_CHECK(cudaFreeHost(ptr)); + return; + } +#endif free(ptr); } |