1 files changed, 191 insertions, 13 deletions
diff --git a/Modules/FindCUDA.cmake b/Modules/FindCUDA.cmake
index 5a834b128..423ad3d13 100644
--- a/Modules/FindCUDA.cmake
+++ b/Modules/FindCUDA.cmake
@@ -91,6 +91,13 @@
 #     CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS.  Flags used for
 #     shared library compilation are not affected by this flag.
 #
+#  CUDA_SEPARABLE_COMPILATION (Default OFF)
+#  -- If set this will enable separable compilation for all CUDA runtime object
+#     files.  If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
+#     (e.g. calling CUDA_WRAP_SRCS directly),
+#     CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
+#     CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
+#
 #  CUDA_VERBOSE_BUILD (Default OFF)
 #  -- Set to ON to see all the commands used when building the CUDA file.  When
 #     using a Makefile generator the value defaults to VERBOSE (run make
@@ -137,11 +144,36 @@
 #  CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
 #  -- Returns a list of PTX files generated from the input source files.
 #
+#  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
+#                                                       cuda_target
+#                                                       object_files )
+#  -- Compute the name of the intermediate link file used for separable
+#     compilation.  This file name is typically passed into
+#     CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS.  output_file_var is produced
+#     based on cuda_target the list of objects files that need separable
+#     compilation as specified by object_files.  If the object_files list is
+#     empty, then output_file_var will be empty.  This function is called
+#     automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE.  Note that
+#     this is a function and not a macro.
+#
 #  CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
 #  -- Sets the directories that should be passed to nvcc
 #     (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
 #     files.
 #
+#
+#  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
+#                                           nvcc_flags object_files)
+#
+#  -- Generates the link object required by separable compilation from the given
+#     object files.  This is called automatically for CUDA_ADD_EXECUTABLE and
+#     CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
+#     directly.  When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the
+#     nvcc_flags passed in are the same as the flags passed in via the OPTIONS
+#     argument.  The only nvcc flag added automatically is the bitness flag as
+#     specified by CUDA_64_BIT_DEVICE_CODE.  Note that this is a function
+#     instead of a macro.
+#
 #  CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
 #                   [STATIC | SHARED | MODULE] [OPTIONS ...] )
 #  -- This is where all the magic happens.  CUDA_ADD_EXECUTABLE,
@@ -223,6 +255,8 @@
 #  CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS
 #                           implementation (alterative to:
 #                           CUDA_ADD_CUBLAS_TO_TARGET macro).
+#  CUDA_cupti_LIBRARY    -- CUDA Profiling Tools Interface library.
+#                           Only available for CUDA version 4.0+.
 #  CUDA_curand_LIBRARY   -- CUDA Random Number Generation library.
 #                           Only available for CUDA version 3.2+.
 #  CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
@@ -406,6 +440,9 @@ endif()
 # Propagate the host flags to the host compiler via -Xcompiler
 option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON)
 
+# Enable CUDA_SEPARABLE_COMPILATION
+option(CUDA_SEPARABLE_COMPILATION "Compile CUDA objects with separable compilation enabled.  Requires CUDA 5.0+" OFF)
+
 # Specifies whether the commands used when compiling the .cu file will be printed out.
 option(CUDA_VERBOSE_BUILD "Print out the commands run while compiling the CUDA source file.  With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
 
@@ -451,6 +488,7 @@ if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
   endif()
   unset(CUDA_VERSION CACHE)
   unset(CUDA_CUDA_LIBRARY CACHE)
+  unset(CUDA_cupti_LIBRARY CACHE)
   unset(CUDA_cublas_LIBRARY CACHE)
   unset(CUDA_cublasemu_LIBRARY CACHE)
   unset(CUDA_cufft_LIBRARY CACHE)
@@ -551,11 +589,11 @@ mark_as_advanced(CUDA_TOOLKIT_INCLUDE)
 set (CUDA_NVCC_INCLUDE_ARGS_USER "")
 set (CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
 
-macro(FIND_LIBRARY_LOCAL_FIRST _var _names _doc)
+macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
   if(CMAKE_SIZEOF_VOID_P EQUAL 8)
     # CUDA 3.2+ on Windows moved the library directories, so we need the new
     # and old paths.
-    set(_cuda_64bit_lib_dir "lib/x64" "lib64" )
+    set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" )
   endif()
   # CUDA 3.2+ on Windows moved the library directories, so we need to new
   # (lib/Win32) and the old path (lib).
@@ -564,7 +602,7 @@ macro(FIND_LIBRARY_LOCAL_FIRST _var _names _doc)
     PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
     ENV CUDA_PATH
     ENV CUDA_LIB_PATH
-    PATH_SUFFIXES ${_cuda_64bit_lib_dir} "lib/Win32" "lib"
+    PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32"
     DOC ${_doc}
     NO_DEFAULT_PATH
     )
@@ -572,15 +610,31 @@ macro(FIND_LIBRARY_LOCAL_FIRST _var _names _doc)
   find_library(${_var} NAMES ${_names} DOC ${_doc})
 endmacro()
 
+macro(cuda_find_library_local_first _var _names _doc)
+  cuda_find_library_local_first_with_path_ext( "${_var}" "${_names}" "${_doc}" "" )
+endmacro()
+
+macro(find_library_local_first _var _names _doc )
+  cuda_find_library_local_first( "${_var}" "${_names}" "${_doc}" "" )
+endmacro()
+
+
 # CUDA_LIBRARIES
-find_library_local_first(CUDA_CUDART_LIBRARY cudart "\"cudart\" library")
+cuda_find_library_local_first(CUDA_CUDART_LIBRARY cudart "\"cudart\" library")
 if(CUDA_VERSION VERSION_EQUAL "3.0")
   # The cudartemu library only existed for the 3.0 version of CUDA.
-  find_library_local_first(CUDA_CUDARTEMU_LIBRARY cudartemu "\"cudartemu\" library")
+  cuda_find_library_local_first(CUDA_CUDARTEMU_LIBRARY cudartemu "\"cudartemu\" library")
   mark_as_advanced(
     CUDA_CUDARTEMU_LIBRARY
     )
 endif()
+
+# CUPTI library showed up in cuda toolkit 4.0
+if(NOT CUDA_VERSION VERSION_LESS "4.0")
+  cuda_find_library_local_first_with_path_ext(CUDA_cupti_LIBRARY cupti "\"cupti\" library" "extras/CUPTI/")
+  mark_as_advanced(CUDA_cupti_LIBRARY)
+endif()
+
 # If we are using emulation mode and we found the cudartemu library then use
 # that one instead of cudart.
 if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
@@ -603,12 +657,7 @@ endif()
 
 # 1.1 toolkit on linux doesn't appear to have a separate library on
 # some platforms.
-find_library_local_first(CUDA_CUDA_LIBRARY cuda "\"cuda\" library (older versions only).")
-
-# Add cuda library to the link line only if it is found.
-if (CUDA_CUDA_LIBRARY)
-  set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
-endif()
+cuda_find_library_local_first(CUDA_CUDA_LIBRARY cuda "\"cuda\" library (older versions only).")
 
 mark_as_advanced(
   CUDA_CUDA_LIBRARY
@@ -618,7 +667,7 @@ mark_as_advanced(
 #######################
 # Look for some of the toolkit helper libraries
 macro(FIND_CUDA_HELPER_LIBS _name)
-  find_library_local_first(CUDA_${_name}_LIBRARY ${_name} "\"${_name}\" library")
+  cuda_find_library_local_first(CUDA_${_name}_LIBRARY ${_name} "\"${_name}\" library")
   mark_as_advanced(CUDA_${_name}_LIBRARY)
 endmacro()
 
@@ -913,6 +962,11 @@ endfunction()
 
 macro(CUDA_WRAP_SRCS cuda_target format generated_files)
 
+  # If CMake doesn't support separable compilation, complain
+  if(CUDA_SEPARABLE_COMPILATION AND CMAKE_VERSION VERSION_LESS "2.8.10.1")
+    message(SEND_ERROR "CUDA_SEPARABLE_COMPILATION isn't supported for CMake versions less than 2.8.10.1")
+  endif()
+
   # Set up all the command line flags here, so that they can be overridden on a per target basis.
 
   set(nvcc_flags "")
@@ -950,6 +1004,8 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
   # $(VCInstallDir)/bin.
   if(CMAKE_GENERATOR MATCHES "Visual Studio")
     set(ccbin_flags -D "\"CCBIN:PATH=$(VCInstallDir)bin\"" )
+  else()
+    set(ccbin_flags)
   endif()
 
   # Figure out which configure we will use and pass that in as an argument to
@@ -1117,7 +1173,11 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       else()
         set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
         set(generated_file_basename "${cuda_target}_generated_${basename}${generated_extension}")
-        set(format_flag "-c")
+        if(CUDA_SEPARABLE_COMPILATION)
+          set(format_flag "-dc")
+        else()
+          set(format_flag "-c")
+        endif()
       endif()
 
       # Set all of our file names.  Make sure that whatever filenames that have
@@ -1146,6 +1206,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
         set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
       endif()
 
+      if( NOT compile_to_ptx AND CUDA_SEPARABLE_COMPILATION)
+        list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
+      endif()
+
       # Bring in the dependencies.  Creates a variable CUDA_NVCC_DEPEND #######
       cuda_include_nvcc_dependencies(${cmake_dependency_file})
 
@@ -1244,6 +1308,100 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
   set(${generated_files} ${_cuda_wrap_generated_files})
 endmacro()
 
+function(_cuda_get_important_host_flags important_flags flag_string)
+  if(CMAKE_GENERATOR MATCHES "Visual Studio")
+    string(REGEX MATCHALL "/M[DT][d]?" flags ${flag_string})
+    list(APPEND ${important_flags} ${flags})
+  else()
+    string(REGEX MATCHALL "-fPIC" flags ${flag_string})
+    list(APPEND ${important_flags} ${flags})
+  endif()
+  set(${important_flags} ${${important_flags}} PARENT_SCOPE)
+endfunction()
+
+###############################################################################
+###############################################################################
+# Separable Compilation Link
+###############################################################################
+###############################################################################
+
+# Compute the filename to be used by CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS
+function(CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME output_file_var cuda_target object_files)
+  if (object_files)
+    set(generated_extension ${CMAKE_${CUDA_C_OR_CXX}_OUTPUT_EXTENSION})
+    set(output_file "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${cuda_target}.dir/${CMAKE_CFG_INTDIR}/${cuda_target}_intermediate_link${generated_extension}")
+  else()
+    set(output_file)
+  endif()
+
+  set(${output_file_var} "${output_file}" PARENT_SCOPE)
+endfunction()
+
+# Setup the build rule for the separable compilation intermediate link file.
+function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options object_files)
+  if (object_files)
+
+    set_source_files_properties("${output_file}"
+      PROPERTIES
+      EXTERNAL_OBJECT TRUE # This is an object file not to be compiled, but only
+                           # be linked.
+      GENERATED TRUE       # This file is generated during the build
+      )
+
+    # For now we are ignoring all the configuration specific flags.
+    set(nvcc_flags)
+    CUDA_PARSE_NVCC_OPTIONS(nvcc_flags ${options})
+    if(CUDA_64_BIT_DEVICE_CODE)
+      list(APPEND nvcc_flags -m64)
+    else()
+      list(APPEND nvcc_flags -m32)
+    endif()
+    # If -ccbin, --compiler-bindir has been specified, don't do anything.  Otherwise add it here.
+    list( FIND nvcc_flags "-ccbin" ccbin_found0 )
+    list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
+    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+      list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
+    endif()
+    set(flags)
+    foreach(config ${CUDA_configuration_types})
+      string(TOUPPER ${config} config_upper)
+      set(important_host_flags)
+      _cuda_get_important_host_flags(important_host_flags ${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}})
+      foreach(f ${important_host_flags})
+        list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
+      endforeach()
+    endforeach()
+    file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
+
+    # Some generators don't handle the multiple levels of custom command
+    # dependencies correctly (obj1 depends on file1, obj2 depends on obj1), so
+    # we work around that issue by compiling the intermediate link object as a
+    # pre-link custom command in that situation.
+    set(do_obj_build_rule TRUE)
+    if (MSVC_VERSION GREATER 1599)
+      # VS 2010 and 2012 have this problem.  If future versions fix this issue,
+      # it should still work, it just won't be as nice as the other method.
+      set(do_obj_build_rule FALSE)
+    endif()
+
+    if (do_obj_build_rule)
+      add_custom_command(
+        OUTPUT ${output_file}
+        DEPENDS ${object_files}
+        COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file}
+        ${flags}
+        COMMENT "Building NVCC intermediate link file ${output_file_relative_path}"
+        )
+    else()
+      add_custom_command(
+        TARGET ${cuda_target}
+        PRE_LINK
+        COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}"
+        COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}"
+        )
+    endif()
+ endif()
+endfunction()
 
 ###############################################################################
 ###############################################################################
@@ -1262,12 +1420,22 @@ macro(CUDA_ADD_LIBRARY cuda_target)
     ${_cmake_options} ${_cuda_shared_flag}
     OPTIONS ${_options} )
 
+  # Compute the file name of the intermedate link file used for separable
+  # compilation.
+  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
   # Add the library.
   add_library(${cuda_target} ${_cmake_options}
     ${_generated_files}
     ${_sources}
+    ${link_file}
     )
 
+  # Add a link phase for the separable compilation if it has been enabled.  If
+  # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
+  # variable will have been defined.
+  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
   target_link_libraries(${cuda_target}
     ${CUDA_LIBRARIES}
     )
@@ -1296,12 +1464,22 @@ macro(CUDA_ADD_EXECUTABLE cuda_target)
   # Create custom commands and targets for each file.
   CUDA_WRAP_SRCS( ${cuda_target} OBJ _generated_files ${_sources} OPTIONS ${_options} )
 
+  # Compute the file name of the intermedate link file used for separable
+  # compilation.
+  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME(link_file ${cuda_target} "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
   # Add the library.
   add_executable(${cuda_target} ${_cmake_options}
     ${_generated_files}
     ${_sources}
+    ${link_file}
     )
 
+  # Add a link phase for the separable compilation if it has been enabled.  If
+  # it has been enabled then the ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS
+  # variable will have been defined.
+  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
+
   target_link_libraries(${cuda_target}
     ${CUDA_LIBRARIES}
     )