summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorCyprien Noel <cyprien.noel@gmail.com>2015-05-18 20:07:36 -0700
committerEvan Shelhamer <shelhamer@imaginarynumber.net>2015-08-09 15:13:11 -0700
commitd2f045768cba7d494abb4d168fc366d6fce80b85 (patch)
tree032c5f952fb5b97868557f320a365f306733a0e0 /include
parentbcc8f50a95ecad954d1887f3fb273eaa298e2274 (diff)
downloadcaffeonacl-d2f045768cba7d494abb4d168fc366d6fce80b85.tar.gz
caffeonacl-d2f045768cba7d494abb4d168fc366d6fce80b85.tar.bz2
caffeonacl-d2f045768cba7d494abb4d168fc366d6fce80b85.zip
Allocate host memory through cudaMallocHost
thanks to discussion by @thatguymike and @flx42
Diffstat (limited to 'include')
-rw-r--r--include/caffe/syncedmem.hpp31
1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 4d339bf4..4a1a2f3f 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -8,26 +8,29 @@
namespace caffe {
-// Theoretically, CaffeMallocHost and CaffeFreeHost should simply call the
-// cudaMallocHost and cudaFree functions in order to create pinned memory.
-// However, those codes rely on the existence of a cuda GPU (I don't know
-// why that is a must since allocating memory should not be accessing the
-// GPU resource, but it just creates an error as of Cuda 5.0) and will cause
-// problem when running on a machine without GPU. Thus, we simply define
-// these two functions for safety and possible future change if the problem
-// of calling cuda functions disappears in a future version.
-//
-// In practice, although we are creating unpinned memory here, as long as we
-// are constantly accessing them the memory pages almost always stays in
-// the physical memory (assuming we have large enough memory installed), and
-// does not seem to create a memory bottleneck here.
-
+// If CUDA is available and in GPU mode, host memory will be allocated pinned,
+// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
+// The improvement in performance seems negligible in the single GPU case,
+// but might be more significant for parallel training. Most importantly,
+// it improved stability for large models on many GPUs.
inline void CaffeMallocHost(void** ptr, size_t size) {
+#ifndef CPU_ONLY
+ if (Caffe::mode() == Caffe::GPU) {
+ CUDA_CHECK(cudaMallocHost(ptr, size));
+ return;
+ }
+#endif
*ptr = malloc(size);
CHECK(*ptr) << "host allocation of size " << size << " failed";
}
inline void CaffeFreeHost(void* ptr) {
+#ifndef CPU_ONLY
+ if (Caffe::mode() == Caffe::GPU) {
+ CUDA_CHECK(cudaFreeHost(ptr));
+ return;
+ }
+#endif
free(ptr);
}