diff options
author | Yangqing Jia <jiayq84@gmail.com> | 2016-07-22 23:58:24 -0700 |
---|---|---|
committer | Yangqing Jia <jiayq84@gmail.com> | 2016-08-02 23:28:23 -0700 |
commit | 1ede7a7ff0fc163dbe54f138bb2287ff9cea6beb (patch) | |
tree | 1c7e3fc9582760af8a141f1c253777dbbfddb725 | |
parent | b2c2d0b70ccd4aae0e438a7864493f03ea8aa399 (diff) | |
download | pytorch-1ede7a7ff0fc163dbe54f138bb2287ff9cea6beb.tar.gz pytorch-1ede7a7ff0fc163dbe54f138bb2287ff9cea6beb.tar.bz2 pytorch-1ede7a7ff0fc163dbe54f138bb2287ff9cea6beb.zip |
more build updates:
(1) nccl submodule, cnmem submodule
(2) mpi ops fallback test
(3) a bit more blob interface
(4) fixed tests
(5) caffe2.python.io -> caffe2.python.dataio to avoid name conflicts
(6) In the build system autogen __init__.py instead of having manual
rules just to copy over an empty __init__.py.
-rw-r--r-- | .gitmodules | 6 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | build.py | 1 | ||||
-rw-r--r-- | caffe/BREW | 4 | ||||
-rw-r--r-- | caffe/__init__.py | 0 | ||||
-rw-r--r-- | caffe/proto/BREW | 8 | ||||
-rw-r--r-- | caffe/proto/__init__.py | 0 | ||||
-rw-r--r-- | caffe2/BREW | 9 | ||||
-rw-r--r-- | caffe2/__init__.py | 5 | ||||
-rw-r--r-- | caffe2/contrib/nccl/BREW | 10 | ||||
-rw-r--r-- | caffe2/core/blob.h | 45 | ||||
-rw-r--r-- | caffe2/core/blob_test.cc | 27 | ||||
-rw-r--r-- | caffe2/core/context_gpu_test.cc | 3 | ||||
-rw-r--r-- | caffe2/mpi/mpi_ops.h | 6 | ||||
-rw-r--r-- | caffe2/operators/operator_fallback_gpu.h | 17 | ||||
-rw-r--r-- | caffe2/proto/BREW | 8 | ||||
-rw-r--r-- | caffe2/proto/__init__.py | 0 | ||||
-rw-r--r-- | caffe2/python/BREW | 4 | ||||
-rw-r--r-- | caffe2/python/__init__.py | 4 | ||||
-rw-r--r-- | caffe2/python/dataio.py (renamed from caffe2/python/io.py) | 0 | ||||
-rw-r--r-- | caffe2/python/dataset.py | 2 | ||||
-rw-r--r-- | caffe2/python/mint/BREW | 1 | ||||
-rw-r--r-- | caffe2/python/mint/__init__.py | 0 | ||||
-rw-r--r-- | third_party/BREW | 47 | ||||
m--------- | third_party/cnmem | 0 | ||||
-rw-r--r-- | third_party/cnmem/BREW | 24 | ||||
-rw-r--r-- | third_party/cnmem/cnmem.cpp | 1287 | ||||
-rw-r--r-- | third_party/cnmem/cnmem.h | 263 | ||||
m--------- | third_party/nccl | 6 |
29 files changed, 160 insertions, 1629 deletions
diff --git a/.gitmodules b/.gitmodules index 9bb357f579..d41d8385db 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,9 @@ [submodule "third_party/pybind11"] path = third_party/pybind11 url = https://github.com/pybind/pybind11.git +[submodule "third_party/nccl"] + path = third_party/nccl + url = https://github.com/nvidia/nccl.git +[submodule "third_party/cnmem"] + path = third_party/cnmem + url = https://github.com/nvidia/cnmem.git @@ -16,6 +16,6 @@ lint: @find caffe2 -type f -exec python brewtool/cpplint.py {} \; linecount: - @cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \ + @cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \ echo "Cloc is not available on the machine. You can install cloc with " && \ echo " sudo apt-get install cloc" @@ -108,6 +108,7 @@ class Config(object): 'arch=compute_30,code=sm_30', 'arch=compute_35,code=sm_35', 'arch=compute_50,code=sm_50', + 'arch=compute_61,code=sm_61', ] # additional CUDA cflags to pass to nvcc. CUDA_CFLAGS = [] diff --git a/caffe/BREW b/caffe/BREW deleted file mode 100644 index 606b721d84..0000000000 --- a/caffe/BREW +++ /dev/null @@ -1,4 +0,0 @@ -filegroup( - name = "caffe_python", - srcs = ["__init__.py"], -) diff --git a/caffe/__init__.py b/caffe/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 --- a/caffe/__init__.py +++ /dev/null diff --git a/caffe/proto/BREW b/caffe/proto/BREW index 6acf48799f..60a2eded6d 100644 --- a/caffe/proto/BREW +++ b/caffe/proto/BREW @@ -4,11 +4,3 @@ proto_library( name = 'caffe_proto', srcs = ['caffe.proto'], ) - -filegroup( - name = "caffe_proto_py", - srcs = ["__init__.py"], - deps = [ - "//caffe:caffe_python", - ] -) diff --git a/caffe/proto/__init__.py b/caffe/proto/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 --- a/caffe/proto/__init__.py +++ /dev/null diff --git a/caffe2/BREW b/caffe2/BREW index 1fb133a44b..c7dcca9247 100644 --- a/caffe2/BREW +++ b/caffe2/BREW @@ -26,7 +26,7 @@ cc_library( deps = [ ":core", ":core_gpu_cu", - "//third_party/cnmem:cnmem", + "//third_party:cnmem", "//third_party:cuda", ], whole_archive = True, @@ -48,6 +48,7 @@ cc_test( excludes=["*gpu_test*"]), deps = [ ":core", + "//caffe2/operators:core_ops", "//third_party:gtest", "//caffe2/test:caffe2_gtest_main", ], @@ -63,11 +64,6 @@ cc_test( ], ) -filegroup( - name = "caffe2_python", - srcs = ["__init__.py"], -) - cc_library( name = "all_available_ops", srcs = [], @@ -79,6 +75,7 @@ cc_library( optional_deps = [ "//caffe2/operators:core_ops_gpu", "//caffe2/operators:core_ops_cudnn", + "//caffe2/contrib/nccl:nccl_ops", "//caffe2/cuda_rtc:rtc_ops", "//caffe2/db:db_gpu", "//caffe2/image:image_ops", diff --git a/caffe2/__init__.py b/caffe2/__init__.py deleted file mode 100644 index eb2c8353ac..0000000000 --- a/caffe2/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -Caffe2: A General Tool for Neural Networks. -""" - -__author__ = 'Yangqing Jia' diff --git a/caffe2/contrib/nccl/BREW b/caffe2/contrib/nccl/BREW new file mode 100644 index 0000000000..efcf77f6df --- /dev/null +++ b/caffe2/contrib/nccl/BREW @@ -0,0 +1,10 @@ +cc_library( + name = "nccl_ops", + srcs = Glob(["*.cc"]), + hdrs = Glob(["*.h"]), + deps = [ + "//caffe2:core_gpu", + "//third_party:nccl", + ], + whole_archive = True, +) diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index c99831d861..12b203f479 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -58,6 +58,9 @@ class Blob { return *static_cast<const T*>(pointer_); } + const void* GetRaw() const { return pointer_; } + void* GetRaw() { return pointer_; } + /** * @brief Gets a mutable pointer to the stored object. * @@ -73,6 +76,7 @@ class Blob { return static_cast<T*>(pointer_); } else { if (is_new_object) *is_new_object = true; + VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>(); return Reset<T>(new T()); } } @@ -87,10 +91,9 @@ class Blob { */ template <class T> T* Reset(T* allocated) { - if (pointer_) { - CHECK_NOTNULL(destroy_)(pointer_); + if (pointer_ && destroy_) { + destroy_(pointer_); } - VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>(); meta_ = TypeMeta::Make<T>(); pointer_ = static_cast<void*>(allocated); destroy_ = &Destroy<T>; @@ -98,17 +101,43 @@ class Blob { } /** + * Sets the underlying object to the allocated one, but does not take over + * the ownership of the passed in pointer. If there is already an object in + * the Blob, the old object is freed. + * + * Unlike Reset, this does not take over the ownership of the pointer and the + * caller is responsible for making sure that the lifetime of the allocated + * blob outlasts the lifetime of any access to this blob, until another Reset + * call is made or the blob is destructed. + */ + template <class T> + typename std::remove_const<T>::type* ShareExternal( + typename std::remove_const<T>::type* allocated) { + return static_cast<T*>( + ShareExternal(static_cast<void*>(allocated), + TypeMeta::Make<typename std::remove_const<T>::type>())); + } + + void* ShareExternal(void* allocated, const TypeMeta& meta) { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + meta_ = meta; + pointer_ = static_cast<void*>(allocated); + destroy_ = nullptr; + return allocated; + } + + /** * Resets the Blob to an empty one. */ inline void Reset() { - if (pointer_) { - CHECK_NOTNULL(destroy_)(pointer_); - pointer_ = nullptr; - meta_ = TypeMeta(); - destroy_ = nullptr; + if (pointer_ && destroy_) { + destroy_(pointer_); } pointer_ = nullptr; meta_ = TypeMeta(); + destroy_ = nullptr; } /** diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index c92e5edf4a..956f8623d5 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) { ASSERT_THROW(blob.Get<int>(), EnforceNotMet); } +TEST(BlobTest, BlobReset) { + Blob blob; + std::unique_ptr<Foo> foo(new Foo()); + EXPECT_TRUE(blob.Reset(foo.release()) != nullptr); + // Also test that Reset works. + blob.Reset(); +} + +TEST(BlobTest, BlobShareExternalPointer) { + Blob blob; + std::unique_ptr<Foo> foo(new Foo()); + EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get()); + EXPECT_TRUE(blob.IsType<Foo>()); + // Also test that Reset works. + blob.Reset(); +} + +TEST(BlobTest, BlobShareExternalObject) { + Blob blob; + Foo foo; + EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo); + EXPECT_TRUE(blob.IsType<Foo>()); + // Also test that Reset works. + blob.Reset(); +} + TEST(BlobTest, StringSerialization) { const std::string kTestString = "Hello world?"; Blob blob; @@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { "DUMMY_ENGINE"); Workspace ws; auto load_op = CreateOperator(op_def, &ws); + EXPECT_TRUE(load_op != nullptr); LOG(INFO) << "Running operator"; load_op->Run(); diff --git a/caffe2/core/context_gpu_test.cc b/caffe2/core/context_gpu_test.cc index 69915188dc..b01284904c 100644 --- a/caffe2/core/context_gpu_test.cc +++ b/caffe2/core/context_gpu_test.cc @@ -1,3 +1,4 @@ +#include <chrono> #include <future> #include <random> #include <thread> @@ -55,6 +56,8 @@ namespace { void TEST_GetStreamAddress(cudaStream_t* ptr) { CUDAContext context(0); *ptr = context.cuda_stream(); + // Sleep for a while so we have concurrent thread executions + std::this_thread::sleep_for(std::chrono::seconds(1)); } } // namespace diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h index 3ee1aa8d93..79e5dab537 100644 --- a/caffe2/mpi/mpi_ops.h +++ b/caffe2/mpi/mpi_ops.h @@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> { bool RunOnDevice() override { MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm(); + CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0), + "Output is of wrong type."); auto* output = Output(0); // Make sure that output is already allocated. - CHECK_GT(output->size(), 0); + CAFFE_ENFORCE(output->size() > 0, + "Broadcast op uses in-place operation so the output " + "should be already allocated."); MPI_CHECK(MPI_Bcast( output->raw_mutable_data(), output->nbytes(), diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index ff9012d4fb..a9ca10c4cd 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> { bool RunOnDevice() override { for (int i = 0; i < InputSize(); ++i) { - local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom( - Input(i), &context_); + if (OperatorBase::InputIsType<TensorCUDA>(i)) { + local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom( + Input(i), &context_); + } else { + VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy."; + // Note(jiayq): This removes a const but conceptually + // local_input_blobs will only be used as const blob input for the + // base op so we are still fine. + local_input_blobs_[i]->ShareExternal( + const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()), + OperatorBase::Inputs()[i]->meta()); + } } // Sync to make sure copies are done. context_.FinishDeviceComputation(); @@ -65,6 +75,9 @@ class GPUFallbackOp final : public Operator<CUDAContext> { return false; } for (int i = 0; i < OutputSize(); ++i) { + CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(), + "GPU fallback op currently does not support non-TensorCPU " + "output type."); Output(i)->CopyFrom( local_output_blobs_[i]->template Get<TensorCPU>(), &context_); } diff --git a/caffe2/proto/BREW b/caffe2/proto/BREW index 132babed97..c2799cd6e4 100644 --- a/caffe2/proto/BREW +++ b/caffe2/proto/BREW @@ -4,11 +4,3 @@ proto_library( name = 'caffe2_proto', srcs = Glob(['*.proto']), ) - -filegroup( - name = "caffe2_proto_py", - srcs = ["__init__.py"], - deps = [ - "//caffe2:caffe2_python", - ] -) diff --git a/caffe2/proto/__init__.py b/caffe2/proto/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 --- a/caffe2/proto/__init__.py +++ /dev/null diff --git a/caffe2/python/BREW b/caffe2/python/BREW index 9b3b92d382..0f0afd6c8f 100644 --- a/caffe2/python/BREW +++ b/caffe2/python/BREW @@ -39,8 +39,8 @@ py_library( srcs=Glob(["*.py"], excludes=["*_test.py"]), deps=[ ":caffe2_python_cpu", - "//caffe/proto:caffe_proto_py", - "//caffe2/proto:caffe2_proto_py", + "//caffe/proto:caffe_proto", + "//caffe2/proto:caffe2_proto", "//caffe2/python/mint:mint", ], optional_deps=[ diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py deleted file mode 100644 index 8181f36637..0000000000 --- a/caffe2/python/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -import atexit - -from . import core, utils, workspace -from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/io.py b/caffe2/python/dataio.py index 89a61384ab..89a61384ab 100644 --- a/caffe2/python/io.py +++ b/caffe2/python/dataio.py diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py index 513dd19d30..d1dd70ca84 100644 --- a/caffe2/python/dataset.py +++ b/caffe2/python/dataset.py @@ -14,7 +14,7 @@ from __future__ import print_function from __future__ import unicode_literals from caffe2.python import core, workspace -from caffe2.python.io import Reader, Writer +from caffe2.python.dataio import Reader, Writer from caffe2.python.schema import Struct import numpy as np diff --git a/caffe2/python/mint/BREW b/caffe2/python/mint/BREW index 24d72811a6..cb9be70d73 100644 --- a/caffe2/python/mint/BREW +++ b/caffe2/python/mint/BREW @@ -1,7 +1,6 @@ py_library( name = "mint", srcs = [ - "__init__.py", "app.py", "static/css/simple-sidebar.css", "templates/index.html", diff --git a/caffe2/python/mint/__init__.py b/caffe2/python/mint/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 --- a/caffe2/python/mint/__init__.py +++ /dev/null diff --git a/third_party/BREW b/third_party/BREW index 0cc77fa8d4..7f4ea5a9d0 100644 --- a/third_party/BREW +++ b/third_party/BREW @@ -92,10 +92,49 @@ cc_thirdparty_target( ], ) -cc_thirdparty_target( - name="cnmen", - deps=["//third_party/cnmem:cnmem"], - cc_obj_files = [], +shell_script( + name = "cnmem_header", + srcs = ["cnmem/include/cnmem.h"], + commands=[ + "DST=$CAFFE2_GENDIR/third_party/include/", + "mkdir -p $DST", + "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/", + ], +) + +cc_library( + name = "cnmem", + srcs = [ + "cnmem/src/cnmem.cpp", + ], + deps = [ + ":cnmem_header", + ":cuda", + ] +) + +shell_script( + name = "nccl_header", + srcs = ["nccl/src/nccl.h"], + commands=[ + "DST=$CAFFE2_GENDIR/third_party/include/", + "mkdir -p $DST", + "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/", + ], +) + +cuda_library( + name = "nccl", + srcs = Glob(["nccl/src/*.cu"]), + deps = [ + ":nccl_header", + ":cuda", + ], + compiler_flags=[ + "-Wno-switch", # NCCL does not follow strict switch enum check. + "-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3", + "-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__", + ], ) ############################################################################### diff --git a/third_party/cnmem b/third_party/cnmem new file mode 160000 +Subproject 28a182d49529da49f4ac4e3941cec3edf16b354 diff --git a/third_party/cnmem/BREW b/third_party/cnmem/BREW deleted file mode 100644 index 684465db9c..0000000000 --- a/third_party/cnmem/BREW +++ /dev/null @@ -1,24 +0,0 @@ -# We need to copy over the header to the right folder. -shell_script( - name = "cnmem_header", - srcs = ["cnmem.h"], - commands=[ - "DST=$CAFFE2_GENDIR/third_party/include/", - "mkdir -p $DST", - "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem.h $DST/", - ], -) - -cuda_library( - name = "cnmem", - srcs = [ - "cnmem.cpp", - ], - hdrs = [ - "cnmem.h", - ], - deps = [ - "cnmem_header", - "//third_party:cuda", - ] -) diff --git a/third_party/cnmem/cnmem.cpp b/third_party/cnmem/cnmem.cpp deleted file mode 100644 index db127fff5b..0000000000 --- a/third_party/cnmem/cnmem.cpp +++ /dev/null @@ -1,1287 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cnmem.h" -#include <cstddef> -#include <vector> -#include <cuda_runtime_api.h> - -#if !defined(WIN32) && defined(_MSC_VER) -#define WIN32 -#endif - -#ifdef WIN32 -#include <Windows.h> -#else -#include <pthread.h> -#endif - -#define CNMEM_GRANULARITY 512 - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) { - switch(status) { - case CNMEM_STATUS_SUCCESS: return "CNMEM_STATUS_SUCCESS"; - case CNMEM_STATUS_CUDA_ERROR: return "CNMEM_STATUS_CUDA_ERROR"; - case CNMEM_STATUS_INVALID_ARGUMENT: return "CNMEM_STATUS_INVALID_ARGUMENT"; - case CNMEM_STATUS_NOT_INITIALIZED: return "CNMEM_STATUS_NOT_INITIALIZED"; - case CNMEM_STATUS_OUT_OF_MEMORY: return "CNMEM_STATUS_OUT_OF_MEMORY"; - default: return "CNMEM_STATUS_UNKNOWN_ERROR"; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#if 0 -#ifdef WIN32 -#define CNMEM_DEBUG_ERROR(...) do { \ - fprintf(stderr, "Error at line: %d\n", __LINE__); \ - fprintf(stderr, __VA_ARGS__); \ -} while(0) -#else -#include <execinfo.h> -static inline void printBacktrace() { - void *stackBuffer[64]; - int numAddresses = backtrace((void**) &stackBuffer, 64); - char **addresses = backtrace_symbols(stackBuffer, numAddresses); - for( int i = 0 ; i < numAddresses ; ++i ) { - fprintf(stderr, "[%2d]: %s\n", i, addresses[i]); - } - free(addresses); -} -#define CNMEM_DEBUG_ERROR(...) do { \ - fprintf(stderr, "Error at line: %d\n", __LINE__); \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, "Backtrace:\n"); \ - printBacktrace(); \ -} while(0) -#endif -#else -#define CNMEM_DEBUG_ERROR(...) -#endif - -#if 0 -#define CNMEM_DEBUG_INFO printf -#else -#define CNMEM_DEBUG_INFO(...) -#endif - -#if 0 // Enable/disable assertions -#include <cassert> -#define CNMEM_ASSERT assert -#else -#define CNMEM_ASSERT(...) -#endif - -#define CNMEM_CHECK_TRUE(cond, error) do { \ - if( !(cond) ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \ - return error; \ - } \ -} while(0) - -#define CNMEM_CHECK(call) do { \ - cnmemStatus_t status = (call); \ - if( status != CNMEM_STATUS_SUCCESS ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \ - cnmemGetErrorString(status)); \ - return status; \ - } \ -} while(0) - -#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \ - cnmemStatus_t status = (call); \ - if( status != CNMEM_STATUS_SUCCESS ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \ - cnmemGetErrorString(status)); \ - (mutex).unlock(); \ - return status; \ - } \ -} while(0) - -#define CNMEM_CHECK_CUDA(call) do { \ - cudaError_t cudaError = (call); \ - if( cudaError == cudaErrorMemoryAllocation ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \ - cudaGetErrorString(cudaError)); \ - return CNMEM_STATUS_OUT_OF_MEMORY; \ - } \ - else if( cudaError != cudaSuccess ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \ - cudaGetErrorString(cudaError)); \ - return CNMEM_STATUS_CUDA_ERROR; \ - } \ -} while(0) - -#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \ - cudaError_t cudaError = (call); \ - if( cudaError == cudaErrorMemoryAllocation ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \ - cudaGetErrorString(cudaError)); \ - (mutex).unlock(); \ - return CNMEM_STATUS_OUT_OF_MEMORY; \ - } \ - else if( cudaError != cudaSuccess ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \ - cudaGetErrorString(cudaError)); \ - (mutex).unlock(); \ - return CNMEM_STATUS_CUDA_ERROR; \ - } \ -} while(0) - -#ifdef WIN32 -#define CNMEM_CHECK_WIN32(call, error_code) do { \ - SetLastError(0); /* Clean the flag. */ \ - call; \ - DWORD status = GetLastError(); \ - if( status ) \ - return error_code; \ -} while(0) -#else -#define CNMEM_CHECK_PTHREAD(call, error_code) do { \ - int status = call; \ - if( status ) { \ - CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \ - return error_code; \ - } \ -} while(0) -#endif - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cnmem { - -static inline std::size_t ceilInt(std::size_t m, std::size_t n) { - CNMEM_ASSERT(n > 0); - return (m + n-1) / n * n; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -class Mutex { -#ifdef WIN32 - mutable CRITICAL_SECTION mCriticalSection; -#else - pthread_mutex_t mMutex; -#endif - -public: - /// Initialize the mutex. - cnmemStatus_t initialize(); - /// Finalize the mutex. - cnmemStatus_t finalize(); - /// Lock the mutex. - cnmemStatus_t lock() const; - /// Unlock the mutex. - cnmemStatus_t unlock() const; -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Mutex::initialize() { -#ifdef WIN32 - CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); -#else -#if 0 - pthread_mutexattr_t attr; - CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR); - CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR); - CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR); -#else - CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR); -#endif -#endif - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Mutex::finalize() { -#ifdef WIN32 - CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); -#else - CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR); -#endif - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Mutex::lock() const { -#ifdef WIN32 - CNMEM_CHECK_WIN32(EnterCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); -#else - CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR); -#endif - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Mutex::unlock() const { -#ifdef WIN32 - CNMEM_CHECK_WIN32(LeaveCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); -#else - CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR); -#endif - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -class Block { - /// The pointer to the memory region on the device. - char *mData; - /// The size of the memory buffer. - std::size_t mSize; - /// The prev/next blocks in the linked list of blocks. - Block *mNext; - /// Is it a head node (i.e. a node obtained from parent->allocate or cudaMalloc). - bool mIsHead; - -public: - /// Create a block. - Block(char *data, std::size_t size, Block *next, bool isHead) - : mData(data) - , mSize(size) - , mNext(next) - , mIsHead(isHead) { - } - - /// The data. - inline const char* getData() const { return mData; } - /// The data (mutable). - inline char* getData() { return mData; } - - /// The size of the block. - inline std::size_t getSize() const { return mSize; } - - /// The next block in the linked list. - inline const Block* getNext() const { return mNext; } - /// The next block in the linked list (mutable). - inline Block* getNext() { return mNext; } - - /// Is it a head block. - inline bool isHead() const { return mIsHead; } - - /// Change the next block. - inline void setNext(Block *next) { mNext = next; } - /// Change the size of the block. - inline void setSize(std::size_t size) { mSize = size; } - /// Set the head flag. - inline void setHeadFlag(bool isHead) { mIsHead = isHead; } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -class Manager { - - /// The parent manager. - Manager *mParent; - /// The children managers. - std::vector<Manager*> mChildren; - /// The GPU device where the memory is allocated. - int mDevice; - /// The stream this manager is associated with. It could be NULL. - cudaStream_t mStream; - /// Is the stream blocking? - bool mIsStreamBlocking; - /// The list of used blocks. - Block *mUsedBlocks; - /// The list of free blocks. - Block *mFreeBlocks; - /// The managed memory size. - std::size_t mSize; - /// The flags. - unsigned mFlags; - /// To support multi-threading. Each manager has its own mutex. - Mutex mMutex; - -public: - /// Create an unitialized manager. - Manager(); - /// Dtor. - ~Manager(); - - /// Allocate a block of memory. - cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true); - /// Release a block of memory. - cnmemStatus_t release(void *ptr); - /// Release memory. It returns true if we have no memory leak. - cnmemStatus_t releaseAllUnsafe(); - /// Reserve memory for a manager. - cnmemStatus_t reserve(std::size_t size); - /// Steal memory from another manager. - cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size); - - /// Print the full memory state. - cnmemStatus_t printMemoryState(FILE *file) const; - - /// The amount of used memory. - inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const { - return getMemoryUnsafe(usedMemory, mUsedBlocks); - } - /// The amount of used memory. - inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const { - return getMemoryUnsafe(freeMemory, mFreeBlocks); - } - - /// Get a specific child based on the stream id. - cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const; - /// Get a specific child based on the stream id. - cnmemStatus_t getChild(Manager *&manager, std::size_t i) const; - /// Add a new child. - cnmemStatus_t addChild(Manager *manager); - /// The number of children. - cnmemStatus_t getNumChildren(std::size_t &numChildren) const; - - /// The associated device. - inline int getDevice() const { return mDevice; } - /// The flags. - inline unsigned getFlags() const { return mFlags; } - /// Get the mutex. - inline const Mutex* getMutex() const { return &mMutex; } - /// The size allocated to that manager. - inline std::size_t getSize() const { return mSize; } - /// The CUDA stream. - inline cudaStream_t getStream() const { return mStream; } - - /// Define the parent. - inline void setParent(Manager *parent) { mParent = parent; } - /// Define the device. - inline void setDevice(int device) { mDevice = device; } - /// Define the stream. - inline cnmemStatus_t setStream(cudaStream_t stream) { - mStream = stream; -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM - mIsStreamBlocking = false; -#elif CUDART_VERSION < 5050 - mIsStreamBlocking = true; -#else - unsigned flags = 0; - CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags)); - mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking); -#endif - return CNMEM_STATUS_SUCCESS; - } - /// Define the flags. - inline void setFlags(unsigned flags) { mFlags = flags; } - -private: - /// The member functions below which are marked "Unsafe" are not thread-safe when called on a - /// same Manager object. Make sure they are called by a single thread in that case. - - /// Allocate a new block and add it to the free list. - cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size); - /// Release a block from the active list. - cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev); - /// Find the best free node based on the size. - cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size); - /// Extract a node from the list of free blocks. - cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen); - - /// Give a free block from that manager. - cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size); - /// Steal a block from another manager. - cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size); - - /// The memory consumption of a list. - cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const; - /// Print an internal linked list. - cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const; -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -Manager::Manager() - : mParent(NULL) - , mChildren() - , mDevice(-1) - , mStream(NULL) - , mIsStreamBlocking(false) - , mUsedBlocks(NULL) - , mFreeBlocks(NULL) - , mSize(0) - , mFlags(CNMEM_FLAGS_DEFAULT) - , mMutex() { - - mMutex.initialize(); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -Manager::~Manager() { - if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it. - return; - } - releaseAllUnsafe(); - mMutex.finalize(); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::addChild(Manager *manager) { - CNMEM_CHECK(mMutex.lock()); - mChildren.push_back(manager); - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) { - CNMEM_CHECK(mMutex.lock()); - - // If the client is not blocking, we have to explicitly synchronize before giving one buffer. - if( !isBlocking ) { - CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex); - } - - // Find the best fit. - Block *best = NULL, *prev = NULL; - CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex); - - // If there's no block left in the list of free blocks (with a sufficient size). Request a new block. - if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) { - CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex); - } - - // Make sure we do have a block or quit. - if( !best ) { - ptr = NULL; - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_OUT_OF_MEMORY; - } - - // Split the free block if needed. - CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex); - - // Push the node to the list of used nodes. - best->setNext(mUsedBlocks); - mUsedBlocks = best; - - // Return the new pointer into memory. - ptr = mUsedBlocks->getData(); - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) { - // Reset the outputs. - curr = prev = NULL; - - // Try to allocate data from the parent or the device. - void *data = NULL; - if( mParent ) { - CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking)); - } - else { - CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size); - CNMEM_CHECK_CUDA(cudaMalloc(&data, size)); - CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data); - } - - // If it failed, there's an unexpected issue. - CNMEM_ASSERT(data); - - // We have data, we now need to add it to the list of free nodes. We keep the list sorted. - Block *next = mFreeBlocks; - for( ; next && next->getData() < data ; next = next->getNext() ) { - prev = next; - } - curr = new Block((char*) data, size, next, true); - if( !curr ) { - return CNMEM_STATUS_OUT_OF_MEMORY; - } - if( prev ) { - prev->setNext(curr); - } - else { - mFreeBlocks = curr; - } - - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) { - // We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node. - Block *next; - if( curr->getSize() == size ) { - next = curr->getNext(); - } - else { - std::size_t remaining = curr->getSize()-size; - Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen); - if( !newBlock ) { - return CNMEM_STATUS_OUT_OF_MEMORY; - } - next = newBlock; - curr->setSize(size); - } - - // Redo the "branching" in the nodes. - if( prev ) { - prev->setNext(next); - } - else { - mFreeBlocks = next; - } - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) { - best = NULL, prev = NULL; - for( Block *temp = mFreeBlocks, *tempPrev = NULL ; temp ; temp = temp->getNext() ) { - if( temp->getSize() >= size && (!best || temp->getSize() < best->getSize()) ) { - best = temp; - prev = tempPrev; - } - tempPrev = temp; - } - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const { - CNMEM_CHECK(mMutex.lock()); - std::size_t i = 0, numChildren = mChildren.size(); - for( ; i < numChildren ; ++i ) { - if( mChildren[i]->mStream == stream ) { - manager = mChildren[i]; - break; - } - } - CNMEM_CHECK(mMutex.unlock()); - return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const { - CNMEM_CHECK(mMutex.lock()); - if( i >= mChildren.size() ) { - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_INVALID_ARGUMENT; - } - manager = mChildren[i]; - - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const { - size = 0; - for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) { - size += curr->getSize(); - } - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -#if 0 -cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const { - CNMEM_CHECK(mMutex.lock()); - CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head)); - CNMEM_CHECK(mMutex.unlock()); - return status; -} -#endif - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const { - CNMEM_CHECK(mMutex.lock()); - numChildren = mChildren.size(); - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) { - // Make sure the block is not in use any more. It could be too coarse grain and we may change - // it in the future. - CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream)); - - // Init the returned values to 0. - blockData = NULL; - blockSize = 0; - - // Find the best node to steal and reserve it. - Block *best = NULL, *prev = NULL; - CNMEM_CHECK(findBestBlockUnsafe(best, prev, size)); - if( !best ) { - return CNMEM_STATUS_OUT_OF_MEMORY; - } - CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true)); - blockData = best->getData(); - blockSize = best->getSize(); - - // Release the memory used by that block. - delete best; - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const { - std::size_t size = 0; - for( Block *curr = (Block*) head; curr; curr = curr->getNext() ) { - size += curr->getSize(); - } - fprintf(file, "| list=\"%s\", size=%lu\n", name, size); - for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) { - fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n", - (std::size_t) curr, - (std::size_t) curr->getData(), - (std::size_t) curr->getSize(), - (std::size_t) curr->getNext(), - (std::size_t) curr->isHead ()); - } - fprintf(file, "|\n"); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::printMemoryState(FILE *file) const { - CNMEM_CHECK(mMutex.lock()); - std::size_t streamCode = (std::size_t) mStream; - std::size_t usedMemory, freeMemory; - CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex); - CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex); - - fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n", - mParent ? "child" : "root", - mDevice, - streamCode, - usedMemory, - freeMemory); - CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex); - CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex); - fprintf(file, "\n"); - CNMEM_CHECK(mMutex.unlock()); - - if( mParent ) { - CNMEM_CHECK(mParent->printMemoryState(file)); - } - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::release(void *ptr) { - // Skip if ptr is NULL. - if( ptr == NULL ) { - return CNMEM_STATUS_SUCCESS; - } - - // Lock to make sure only one thread execute that fragment of code. - CNMEM_CHECK(mMutex.lock()); - - // Find the node in the list of used blocks. - Block *curr = mUsedBlocks, *prev = NULL; - for( ; curr && curr->getData() != ptr ; curr = curr->getNext() ) { - prev = curr; - } - - // Make sure we have found a node. - if( curr == NULL ) { - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_INVALID_ARGUMENT; - } - - // We have the node so release it. - cnmemStatus_t result = releaseBlockUnsafe(curr, prev); - CNMEM_CHECK(mMutex.unlock()); - return result; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::releaseAllUnsafe() { - // Destroy the children if any. - for( std::size_t i = 0; i < mChildren.size(); ++i ) { - Manager *child = mChildren[i]; - CNMEM_CHECK(child->releaseAllUnsafe()); - delete child; - } - mChildren.clear(); - - // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!! - if( !mParent ) { - while( mUsedBlocks ) { - CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL)); - } - } - - // We should be having only free blocks that are head blocks. Release those blocks. - while( mFreeBlocks ) { - if( mParent ) { - CNMEM_CHECK(mParent->release(mFreeBlocks->getData())); - } - else if( mFreeBlocks->isHead() ) { - void *data = mFreeBlocks->getData(); - CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data); - CNMEM_CHECK_CUDA(cudaFree(data)); - CNMEM_DEBUG_INFO(">> success\n"); - } - Block *block = mFreeBlocks; - mFreeBlocks = mFreeBlocks->getNext(); - delete block; - } - - // We shouldn't have any used block left. Or, it means the user is causing memory leaks! - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) { - // The current node cannot be NULL! - CNMEM_ASSERT(curr != NULL); - - // Change the connection of the node. - if( prev ) { - prev->setNext(curr->getNext()); - } - else { - mUsedBlocks = curr->getNext(); - } - - // Find the location where this block should be added to the free list. - prev = NULL; - Block *iter = mFreeBlocks; - for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) { - prev = iter; - } - - // Keep track of the successor of pred. We may lose track of it in the following "else". - Block *next = prev ? prev->getNext() : mFreeBlocks; - - // We first check if we can merge the block with its predecessor in the list and curr can be merged. - if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) { - prev->setSize(prev->getSize() + curr->getSize()); - delete curr; - curr = prev; - } - else if( prev ) { - prev->setNext(curr); - } - else { - mFreeBlocks = curr; - } - - // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries. - if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) { - curr->setSize(curr->getSize() + next->getSize()); - curr->setNext(next->getNext()); - delete next; - } - else { - curr->setNext(next); - } - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::reserve(std::size_t size) { - CNMEM_CHECK(mMutex.lock()); - Block *curr, *prev; - CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(curr, prev, size), mMutex); - mSize = size; - CNMEM_CHECK(mMutex.unlock()); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) { - // If we cannot steal, don't even try. - if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) { - stolen = NULL; - return CNMEM_STATUS_INVALID_ARGUMENT; - } - - // The stolen block. - void *data = NULL; std::size_t dataSize = 0; - if( !mChildren.empty() ) { - CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size)); - } - else if( mParent ) { - CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size)); - } - - // Make sure we do have a block of memory or quit. - if( !data ) { - stolen = NULL; - return CNMEM_STATUS_OUT_OF_MEMORY; - } - - // Push the block in the used list. - mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true); - if( !mUsedBlocks ) { - return CNMEM_STATUS_OUT_OF_MEMORY; - } - - // Return the new pointer into memory. - stolen = data; - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, ::size_t size) { - // No block found and no room to grow. Try to steal from a children (if we have any). - data = NULL; - for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) { - Manager *child = mChildren[i]; - if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) { - break; - } - } - - // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably. - if( !data ) { - return CNMEM_STATUS_OUT_OF_MEMORY; - } - - // We have got a node from a children. We need to update our "used" list before we can do - // anything with it. - Block *curr = mUsedBlocks, *prev = NULL; - for( ; curr ; curr = curr->getNext() ) { - if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) { - break; - } - prev = curr; - } - - // Curr points to the node which contains that memory region. - CNMEM_ASSERT(curr); - - // If it is exactly the same memory region, we are done!!! - if( curr->getData() == data && curr->getSize() == dataSize ) { - return CNMEM_STATUS_SUCCESS; - } - - // Track the blocks before and after curr. - Block *next = curr->getNext(); - - // We may have up to 3 blocks. - std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData()); - std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize); - - // The resulting block. - Block *result = curr; - - // If we have no space between curr->getData and block->getData. - if( sizeBefore == 0 ) { - curr->setSize(dataSize); - } - else { - curr->setSize(sizeBefore); - Block *block = new Block((char*) data, dataSize, next, false); - if( !block ) { - return CNMEM_STATUS_OUT_OF_MEMORY; - } - curr->setNext(block); - curr = block; - data = (char*) data + dataSize; - dataSize = sizeAfter; - result = block; - } - - // We have space at the end so we may need to add a new node. - if( sizeAfter > 0 ) { - Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false); - if( !block ) { - return CNMEM_STATUS_OUT_OF_MEMORY; - } - curr->setNext(block); - curr = block; - } - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -class Context { - /// Use a magic number to specify that the context is valid. - enum { CTX_VALID = 0x1f5632a3 }; - - /// The reference counting mechanism. - int mRefCount; - /// The mutex to increase/decrease the reference counter. TODO: Use atomics. - Mutex mMutex; - /// The memory managers. - std::vector<Manager> mManagers; - /// The global context. - static Context *sCtx; - /// Use a magic number to specify that the context was created. - static int sCtxCheck; - -public: - /// Ctor. - Context() : mRefCount(1) { mMutex.initialize(); } - /// Dtor. - ~Context(); - /// Get the managers. - inline std::vector<Manager>& getManagers() { return mManagers; } - /// Get a single manager associated with a device. - inline Manager& getManager(int i) { return mManagers[i]; } - - /// Create the global context. - static cnmemStatus_t create(); - /// Check that the context was created. - static inline bool check() { return sCtxCheck == CTX_VALID && sCtx; } - /// Get the global context. - static Context* get(); - /// Retain. - static cnmemStatus_t retain(); - /// Release. - static cnmemStatus_t release(); -}; - -Context *Context::sCtx; -int Context::sCtxCheck; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -Context::~Context() { - int oldDevice; - cudaGetDevice(&oldDevice); - for( std::size_t i = 0 ; i < mManagers.size() ; ++i ) { - if( mManagers[i].getDevice() != -1 ) { // Skip invalid managers. - cudaSetDevice(mManagers[i].getDevice()); - mManagers[i].releaseAllUnsafe(); - } - } - mManagers.clear(); - mMutex.finalize(); - cudaSetDevice(oldDevice); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Context::create() { - sCtx = new Context; - sCtxCheck = CTX_VALID; - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -Context* Context::get() { - CNMEM_ASSERT(Context::check()); - return Context::sCtx; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Context::retain() { - CNMEM_CHECK(sCtx->mMutex.lock()); - sCtx->mRefCount++; - CNMEM_CHECK(sCtx->mMutex.unlock()); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t Context::release() { - CNMEM_CHECK(sCtx->mMutex.lock()); - int refCount = --sCtx->mRefCount; - CNMEM_CHECK(sCtx->mMutex.unlock()); - - if( refCount == 0 ) { // Kill the context. - delete sCtx; - Context::sCtx = NULL; - Context::sCtxCheck = 0; - } - return CNMEM_STATUS_SUCCESS; -} - -} // namespace cnmem - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -extern "C" { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) { - // Make sure we have at least one device declared. - CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT); - - // Find the largest ID of the device. - int maxDevice = 0; - for( int i = 0 ; i < numDevices ; ++i ) { - if( devices[i].device > maxDevice ) { - maxDevice = devices[i].device; - } - } - - // Create the global context. - cnmem::Context::create(); - cnmem::Context *ctx = cnmem::Context::get(); - - // Allocate enough managers. - CNMEM_CHECK_TRUE(maxDevice >= 0, CNMEM_STATUS_INVALID_ARGUMENT); - std::vector<cnmem::Manager> &managers = ctx->getManagers(); - managers.resize(maxDevice+1); - - // Create a root manager for each device and create the children. - int oldDevice; - CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice)); - for( int i = 0 ; i < numDevices ; ++i ) { - CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device)); - std::size_t size = devices[i].size; - if( size == 0 ) { - cudaDeviceProp props; - CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device)); - size = props.totalGlobalMem / 2; - } - CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT); - - cnmem::Manager &manager = ctx->getManager(devices[i].device); - manager.setDevice(devices[i].device); - manager.setFlags(flags); - - size = cnmem::ceilInt(size, CNMEM_GRANULARITY); - CNMEM_CHECK(manager.reserve(size)); - - for( int j = 0 ; j < devices[i].numStreams ; ++j ) { - cnmem::Manager *child = new cnmem::Manager; - child->setParent(&manager); - child->setDevice(devices[i].device); - child->setStream(devices[i].streams[j]); - child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW); - if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) { - CNMEM_CHECK(child->reserve(devices[i].streamSizes[j])); - } - CNMEM_CHECK(manager.addChild(child)); - } - } - CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice)); - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemFinalize() { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - return cnmem::Context::release(); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemRetain() { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - return cnmem::Context::retain(); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemRelease() { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - return cnmem::Context::release(); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT); - - int device; - CNMEM_CHECK_CUDA(cudaGetDevice(&device)); - - cnmem::Manager &root = cnmem::Context::get()->getManager(device); - cnmem::Manager *child = new cnmem::Manager; - child->setParent(&root); - child->setDevice(device); - child->setStream(stream); - child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW); - root.addChild(child); - - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - if( !ptr && !size ) { - return CNMEM_STATUS_SUCCESS; - } - else if( !size ) { - ptr[0] = NULL; - return CNMEM_STATUS_SUCCESS; - } - CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT); - - int device; - CNMEM_CHECK_CUDA(cudaGetDevice(&device)); - - cnmem::Manager &root = cnmem::Context::get()->getManager(device); - cnmem::Manager *manager = &root; - if( stream ) { - CNMEM_CHECK(root.getChildFromStream(manager, stream)); - } - CNMEM_ASSERT(manager); - - size = cnmem::ceilInt(size, CNMEM_GRANULARITY); - cnmemStatus_t result = manager->allocate(ptr[0], size); - - // We failed to allocate but there might still be a buffer available in another manager. Try to - // steal it. - if( result == CNMEM_STATUS_OUT_OF_MEMORY ) { - - // Try to acquire locks on all the children. - std::size_t numChildren; - CNMEM_CHECK(root.getNumChildren(numChildren)); - std::vector<const cnmem::Mutex*> mutexes(numChildren); - - std::size_t numLocked = 0; - for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) { - cnmem::Manager *child; - CNMEM_CHECK(root.getChild(child, i)); - mutexes[numLocked] = child->getMutex(); - if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) { - break; - } - } - - // One lock failed, quit. Reduce the damage as much as possible, though. - if( numLocked != numChildren ) { - for( std::size_t i = 0 ; i < numLocked ; ++i ) { - (void) mutexes[i]->unlock(); - } - return CNMEM_STATUS_UNKNOWN_ERROR; - } - - // Grab the lock on the root, first. - const cnmem::Mutex *rootMutex = root.getMutex(); - CNMEM_CHECK(rootMutex->lock()); - - // We acquired all the lock so we try to steal a node from another child. - if( numLocked == mutexes.size() ) { - result = manager->stealUnsafe(ptr[0], size); - } - for( std::size_t i = 0 ; i < numLocked ; ++i ) { - cnmemStatus_t lockStatus = mutexes[i]->unlock(); - if( lockStatus != CNMEM_STATUS_SUCCESS ) { - // Starting from now we are panicking!!! One lock failed to be released, we try - // we others. We could also give up because we are already screwed. I don't know - // what's best! Comment are welcome. - result = lockStatus; - } - } - CNMEM_CHECK(rootMutex->unlock()); - } - return result; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - if( ptr == NULL ) { - return CNMEM_STATUS_SUCCESS; - } - - int device; - CNMEM_CHECK_CUDA(cudaGetDevice(&device)); - - cnmem::Manager &root = cnmem::Context::get()->getManager(device); - cnmem::Manager *manager = &root; - if( stream ) { - CNMEM_CHECK(root.getChildFromStream(manager, stream)); - } - CNMEM_ASSERT(manager); - return manager->release(ptr); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT); - - int device; - CNMEM_CHECK_CUDA(cudaGetDevice(&device)); - cnmem::Manager &root = cnmem::Context::get()->getManager(device); - cnmem::Manager *manager = &root; - if( stream ) { - CNMEM_CHECK(root.getChildFromStream(manager, stream)); - } - CNMEM_ASSERT(manager); - - const cnmem::Mutex *mutex = manager->getMutex(); - CNMEM_CHECK(mutex->lock()); - CNMEM_CHECK_OR_UNLOCK(manager->getFreeMemoryUnsafe(*freeMem), *mutex); - size_t usedMem; - CNMEM_CHECK_OR_UNLOCK(manager->getUsedMemoryUnsafe(usedMem), *mutex); - CNMEM_CHECK(mutex->unlock()); - totalMem[0] = usedMem + freeMem[0]; - return CNMEM_STATUS_SUCCESS; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) { - CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); - - int device; - CNMEM_CHECK_CUDA(cudaGetDevice(&device)); - cnmem::Manager &root = cnmem::Context::get()->getManager(device); - cnmem::Manager *manager = &root; - if( stream ) { - CNMEM_CHECK(root.getChildFromStream(manager, stream)); - } - CNMEM_ASSERT(manager); - return manager->printMemoryState(file); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // extern "C" - diff --git a/third_party/cnmem/cnmem.h b/third_party/cnmem/cnmem.h deleted file mode 100644 index 19fd2f8402..0000000000 --- a/third_party/cnmem/cnmem.h +++ /dev/null @@ -1,263 +0,0 @@ -/* ********************************************************************** - * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * ********************************************************************** */ -#pragma once - -#ifdef __cplusplus -#include "cstdio" -#else -#include "stdio.h" -#endif -#include "cuda_runtime_api.h" - -#if defined(_MSC_VER) || defined(WIN32) -#ifdef CNMEM_DLLEXPORT -#define CNMEM_API __declspec(dllexport) -#else -#define CNMEM_API __declspec(dllimport) -#endif -#else -#ifdef CNMEM_DLLEXPORT -#define CNMEM_API __attribute__((visibility ("default"))) -#else -#define CNMEM_API -#endif -#endif - -#define CNMEM_VERSION 100 // It corresponds to 1.0.0 - -#ifdef __cplusplus -extern "C" { -#endif - -/* ********************************************************************************************* */ - -typedef enum -{ - CNMEM_STATUS_SUCCESS = 0, - CNMEM_STATUS_CUDA_ERROR, - CNMEM_STATUS_INVALID_ARGUMENT, - CNMEM_STATUS_NOT_INITIALIZED, - CNMEM_STATUS_OUT_OF_MEMORY, - CNMEM_STATUS_UNKNOWN_ERROR -} cnmemStatus_t; - -/* ********************************************************************************************* */ - -typedef enum -{ - CNMEM_FLAGS_DEFAULT = 0, /// Default flags. - CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. - CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory. -} cnmemManagerFlags_t; - -/* ********************************************************************************************* */ - -typedef struct cnmemDevice_t_ -{ - /** The device number. */ - int device; - /** The size to allocate for that device. If 0, the implementation chooses the size. */ - size_t size; - /** The number of named streams associated with the device. The NULL stream is not counted. */ - int numStreams; - /** The streams associated with the device. It can be NULL. The NULL stream is managed. */ - cudaStream_t *streams; - /** The size reserved for each streams. It can be 0. */ - size_t *streamSizes; - -} cnmemDevice_t; - -/** - * \brief Initialize the library and allocate memory on the listed devices. - * - * For each device, an internal memory manager is created and the specified amount of memory is - * allocated (it is the size defined in device[i].size). For each, named stream an additional - * memory manager is created. Currently, it is implemented as a tree of memory managers: A root - * manager for the device and a list of children, one for each named stream. - * - * This function must be called before any other function in the library. It has to be called - * by a single thread since it is not thread-safe. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, - * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory, - * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function. - */ -cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags); - -/** - * \brief Release all the allocated memory. - * - * This function must be called by a single thread and after all threads that called - * cnmemMalloc/cnmemFree have joined. This function is not thread-safe. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, - * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. - */ -cnmemStatus_t CNMEM_API cnmemFinalize(); - -/** - * \brief Increase the internal reference counter of the context object. - * - * This function increases the internal reference counter of the library. The purpose of that - * reference counting mechanism is to give more control to the user over the lifetime of the - * library. It is useful with scoped memory allocation which may be destroyed in a final - * memory collection after the end of main(). That function is thread-safe. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, - */ -cnmemStatus_t CNMEM_API cnmemRetain(); - -/** - * \brief Decrease the internal reference counter of the context object. - * - * This function decreases the internal reference counter of the library. The purpose of that - * reference counting mechanism is to give more control to the user over the lifetime of the - * library. It is useful with scoped memory allocation which may be destroyed in a final - * memory collection after the end of main(). That function is thread-safe. - * - * You can use \c cnmemRelease to explicitly finalize the library. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, - */ -cnmemStatus_t CNMEM_API cnmemRelease(); - -/** - * \brief Add a new stream to the pool of managed streams on a device. - * - * This function registers a new stream into a device memory manager. It is thread-safe. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, - */ -cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream); - -/** - * \brief Allocate memory. - * - * This function allocates memory and initializes a pointer to device memory. If no memory - * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe. - * - * The behavior of that function is the following: - * - * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device - * memory. If there's a buffer of size larger or equal to the requested size in the list of - * free blocks, it is returned. If there's no such buffer but the manager is allowed to grow - * its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls - * cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not - * allowed to grow, the manager attempts to steal memory from one of its children (unless - * CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns - * CNMEM_STATUS_OUT_OF_MEMORY. - * - * - If the stream is a named stream, the initial request goes to the memory manager associated - * with that stream. If a free node is available in the lists of that manager, it is returned. - * Otherwise, the request is passed to the root node and works as if the request were made on - * the NULL stream. - * - * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the - * mechanism to steal memory from the children induces GPU synchronizations (the manager has to - * make sure no kernel uses a given buffer before stealing it) and it the execution is - * sequential (in a multi-threaded context, the code is executed in a critical section inside - * the cnmem library - no need for the user to wrap cnmemMalloc with locks). - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, - * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, - * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available, - * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. - */ -cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream); - -/** - * \brief Release memory. - * - * This function releases memory and recycles a memory block in the manager. This function is - * thread safe. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, - * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, - * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. - */ -cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream); - -/* ********************************************************************************************* */ -/* Utility functions. */ -/* ********************************************************************************************* */ - -/** - * \brief Returns the amount of memory managed by the memory manager associated with a stream. - * - * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple- - * xity linear in the number of allocated blocks so do not call it in performance critical - * sections. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, - * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, - * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. - */ -cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream); - -/** - * \brief Print a list of nodes to a file. - * - * This function is intended to be used in case of complex scenarios to help understand the - * behaviour of the memory managers/application. It is thread safe. - * - * \return - * CNMEM_STATUS_SUCCESS, if everything goes fine, - * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, - * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 - * or free_mem == 0, - * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. - */ -cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream); - -/** - * \brief Converts a cnmemStatus_t value to a string. - */ -const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status); - -/* ********************************************************************************************* */ - -#ifdef __cplusplus -} // extern "C" -#endif - diff --git a/third_party/nccl b/third_party/nccl new file mode 160000 +Subproject b3a9e1333d9e2e1b8553b5843ba1ba4f7c79739 |