1 files changed, 745 insertions, 411 deletions
diff --git a/Modules/FindCUDA.cmake b/Modules/FindCUDA.cmake
index 2705d3271..a4dca5433 100644
--- a/Modules/FindCUDA.cmake
+++ b/Modules/FindCUDA.cmake
@@ -1,292 +1,351 @@
-# - Tools for building CUDA C files: libraries and build dependencies.
-# This script locates the NVIDIA CUDA C tools. It should work on linux, windows,
-# and mac and should be reasonably up to date with CUDA C releases.
-#
-# This script makes use of the standard find_package arguments of <VERSION>,
-# REQUIRED and QUIET.  CUDA_FOUND will report if an acceptable version of CUDA
-# was found.
-#
-# The script will prompt the user to specify CUDA_TOOLKIT_ROOT_DIR if the prefix
-# cannot be determined by the location of nvcc in the system path and REQUIRED
-# is specified to find_package(). To use a different installed version of the
-# toolkit set the environment variable CUDA_BIN_PATH before running cmake
-# (e.g. CUDA_BIN_PATH=/usr/local/cuda1.0 instead of the default /usr/local/cuda)
-# or set CUDA_TOOLKIT_ROOT_DIR after configuring.  If you change the value of
-# CUDA_TOOLKIT_ROOT_DIR, various components that depend on the path will be
-# relocated.
+#.rst:
+# FindCUDA
+# --------
+#
+# Tools for building CUDA C files: libraries and build dependencies.
+#
+# This script locates the NVIDIA CUDA C tools.  It should work on linux,
+# windows, and mac and should be reasonably up to date with CUDA C
+# releases.
+#
+# This script makes use of the standard find_package arguments of
+# <VERSION>, REQUIRED and QUIET.  CUDA_FOUND will report if an
+# acceptable version of CUDA was found.
+#
+# The script will prompt the user to specify CUDA_TOOLKIT_ROOT_DIR if
+# the prefix cannot be determined by the location of nvcc in the system
+# path and REQUIRED is specified to find_package().  To use a different
+# installed version of the toolkit set the environment variable
+# CUDA_BIN_PATH before running cmake (e.g.
+# CUDA_BIN_PATH=/usr/local/cuda1.0 instead of the default
+# /usr/local/cuda) or set CUDA_TOOLKIT_ROOT_DIR after configuring.  If
+# you change the value of CUDA_TOOLKIT_ROOT_DIR, various components that
+# depend on the path will be relocated.
 #
 # It might be necessary to set CUDA_TOOLKIT_ROOT_DIR manually on certain
-# platforms, or to use a cuda runtime not installed in the default location. In
-# newer versions of the toolkit the cuda library is included with the graphics
-# driver- be sure that the driver version matches what is needed by the cuda
-# runtime version.
-#
-# The following variables affect the behavior of the macros in the script (in
-# alphebetical order).  Note that any of these flags can be changed multiple
-# times in the same directory before calling CUDA_ADD_EXECUTABLE,
-# CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX or CUDA_WRAP_SRCS.
-#
-#  CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
-#  -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
-#     Note that making this different from the host code when generating object
-#     or C files from CUDA code just won't work, because size_t gets defined by
-#     nvcc in the generated source.  If you compile to PTX and then load the
-#     file yourself, you can mix bit sizes between device and host.
-#
-#  CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
-#  -- Set to ON if you want the custom build rule to be attached to the source
-#     file in Visual Studio.  Turn OFF if you add the same cuda file to multiple
-#     targets.
-#
-#     This allows the user to build the target from the CUDA file; however, bad
-#     things can happen if the CUDA source file is added to multiple targets.
-#     When performing parallel builds it is possible for the custom build
-#     command to be run more than once and in parallel causing cryptic build
-#     errors.  VS runs the rules for every source file in the target, and a
-#     source can have only one rule no matter how many projects it is added to.
-#     When the rule is run from multiple targets race conditions can occur on
-#     the generated file.  Eventually everything will get built, but if the user
-#     is unaware of this behavior, there may be confusion.  It would be nice if
-#     this script could detect the reuse of source files across multiple targets
-#     and turn the option off for the user, but no good solution could be found.
-#
-#  CUDA_BUILD_CUBIN (Default OFF)
-#  -- Set to ON to enable and extra compilation pass with the -cubin option in
-#     Device mode. The output is parsed and register, shared memory usage is
-#     printed during build.
-#
-#  CUDA_BUILD_EMULATION (Default OFF for device mode)
-#  -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
-#     when CUDA_BUILD_EMULATION is TRUE.
-#
-#  CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
-#  -- Set to the path you wish to have the generated files placed.  If it is
-#     blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
-#     Intermediate files will always be placed in
-#     CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
-#
-#  CUDA_HOST_COMPILATION_CPP (Default ON)
-#  -- Set to OFF for C compilation of host code.
-#
-#  CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
-#  -- Set the host compiler to be used by nvcc.  Ignored if -ccbin or
-#     --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
-#     CUDA_NVCC_FLAGS_<CONFIG> variables.  For Visual Studio targets
-#     $(VCInstallDir)/bin is a special value that expands out to the path when
-#     the command is run from withing VS.
-#
-#  CUDA_NVCC_FLAGS
-#  CUDA_NVCC_FLAGS_<CONFIG>
-#  -- Additional NVCC command line arguments.  NOTE: multiple arguments must be
-#     semi-colon delimited (e.g. --compiler-options;-Wall)
-#
-#  CUDA_PROPAGATE_HOST_FLAGS (Default ON)
-#  -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
-#     dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
-#     host compiler through nvcc's -Xcompiler flag.  This helps make the
-#     generated host code match the rest of the system better.  Sometimes
-#     certain flags give nvcc problems, and this will help you turn the flag
-#     propagation off.  This does not affect the flags supplied directly to nvcc
-#     via CUDA_NVCC_FLAGS or through the OPTION flags specified through
-#     CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS.  Flags used for
-#     shared library compilation are not affected by this flag.
-#
-#  CUDA_SEPARABLE_COMPILATION (Default OFF)
-#  -- If set this will enable separable compilation for all CUDA runtime object
-#     files.  If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
-#     (e.g. calling CUDA_WRAP_SRCS directly),
-#     CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
-#     CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
-#
-#  CUDA_VERBOSE_BUILD (Default OFF)
-#  -- Set to ON to see all the commands used when building the CUDA file.  When
-#     using a Makefile generator the value defaults to VERBOSE (run make
-#     VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
-#     always print the output.
-#
-# The script creates the following macros (in alphebetical order):
-#
-#  CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
-#  -- Adds the cufft library to the target (can be any target).  Handles whether
-#     you are in emulation mode or not.
-#
-#  CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
-#  -- Adds the cublas library to the target (can be any target).  Handles
-#     whether you are in emulation mode or not.
-#
-#  CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
-#                       [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
-#  -- Creates an executable "cuda_target" which is made up of the files
-#     specified.  All of the non CUDA C files are compiled using the standard
-#     build rules specified by CMAKE and the cuda files are compiled to object
-#     files using nvcc and the host compiler.  In addition CUDA_INCLUDE_DIRS is
-#     added automatically to include_directories().  Some standard CMake target
-#     calls can be used on the target after calling this macro
-#     (e.g. set_target_properties and target_link_libraries), but setting
-#     properties that adjust compilation flags will not affect code compiled by
-#     nvcc.  Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
-#     CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
-#
-#  CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
-#                    [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
-#  -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
-#
-#  CUDA_BUILD_CLEAN_TARGET()
-#  -- Creates a convience target that deletes all the dependency files
-#     generated.  You should make clean after running this target to ensure the
-#     dependency files get regenerated.
-#
-#  CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
-#                [OPTIONS ...] )
-#  -- Returns a list of generated files from the input source files to be used
-#     with ADD_LIBRARY or ADD_EXECUTABLE.
-#
-#  CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
-#  -- Returns a list of PTX files generated from the input source files.
-#
-#  CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
-#                                                       cuda_target
-#                                                       object_files )
-#  -- Compute the name of the intermediate link file used for separable
-#     compilation.  This file name is typically passed into
-#     CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS.  output_file_var is produced
-#     based on cuda_target the list of objects files that need separable
-#     compilation as specified by object_files.  If the object_files list is
-#     empty, then output_file_var will be empty.  This function is called
-#     automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE.  Note that
-#     this is a function and not a macro.
-#
-#  CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
-#  -- Sets the directories that should be passed to nvcc
-#     (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
-#     files.
-#
-#
-#  CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
-#                                           nvcc_flags object_files)
-#
-#  -- Generates the link object required by separable compilation from the given
-#     object files.  This is called automatically for CUDA_ADD_EXECUTABLE and
-#     CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
-#     directly.  When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the
-#     nvcc_flags passed in are the same as the flags passed in via the OPTIONS
-#     argument.  The only nvcc flag added automatically is the bitness flag as
-#     specified by CUDA_64_BIT_DEVICE_CODE.  Note that this is a function
-#     instead of a macro.
-#
-#  CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
-#                   [STATIC | SHARED | MODULE] [OPTIONS ...] )
-#  -- This is where all the magic happens.  CUDA_ADD_EXECUTABLE,
-#     CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
-#     function under the hood.
-#
-#     Given the list of files (file0 file1 ... fileN) this macro generates
-#     custom commands that generate either PTX or linkable objects (use "PTX" or
-#     "OBJ" for the format argument to switch).  Files that don't end with .cu
-#     or have the HEADER_FILE_ONLY property are ignored.
-#
-#     The arguments passed in after OPTIONS are extra command line options to
-#     give to nvcc.  You can also specify per configuration options by
-#     specifying the name of the configuration followed by the options.  General
-#     options must preceed configuration specific options.  Not all
-#     configurations need to be specified, only the ones provided will be used.
-#
-#        OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
-#        DEBUG -g
-#        RELEASE --use_fast_math
-#        RELWITHDEBINFO --use_fast_math;-g
-#        MINSIZEREL --use_fast_math
-#
-#     For certain configurations (namely VS generating object files with
-#     CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
-#     be produced for the given cuda file.  This is because when you add the
-#     cuda file to Visual Studio it knows that this file produces an object file
-#     and will link in the resulting object file automatically.
-#
-#     This script will also generate a separate cmake script that is used at
-#     build time to invoke nvcc.  This is for several reasons.
-#
-#       1. nvcc can return negative numbers as return values which confuses
-#       Visual Studio into thinking that the command succeeded.  The script now
-#       checks the error codes and produces errors when there was a problem.
-#
-#       2. nvcc has been known to not delete incomplete results when it
-#       encounters problems.  This confuses build systems into thinking the
-#       target was generated when in fact an unusable file exists.  The script
-#       now deletes the output files if there was an error.
-#
-#       3. By putting all the options that affect the build into a file and then
-#       make the build rule dependent on the file, the output files will be
-#       regenerated when the options change.
-#
-#     This script also looks at optional arguments STATIC, SHARED, or MODULE to
-#     determine when to target the object compilation for a shared library.
-#     BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
-#     CUDA_ADD_LIBRARY.  On some systems special flags are added for building
-#     objects intended for shared libraries.  A preprocessor macro,
-#     <target_name>_EXPORTS is defined when a shared library compilation is
-#     detected.
-#
-#     Flags passed into add_definitions with -D or /D are passed along to nvcc.
-#
-# The script defines the following variables:
-#
-#  CUDA_VERSION_MAJOR    -- The major version of cuda as reported by nvcc.
-#  CUDA_VERSION_MINOR    -- The minor version.
-#  CUDA_VERSION
-#  CUDA_VERSION_STRING   -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
-#
-#  CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
-#  CUDA_SDK_ROOT_DIR     -- Path to the CUDA SDK.  Use this to find files in the
-#                           SDK.  This script will not directly support finding
-#                           specific libraries or headers, as that isn't
-#                           supported by NVIDIA.  If you want to change
-#                           libraries when the path changes see the
-#                           FindCUDA.cmake script for an example of how to clear
-#                           these variables.  There are also examples of how to
-#                           use the CUDA_SDK_ROOT_DIR to locate headers or
-#                           libraries, if you so choose (at your own risk).
-#  CUDA_INCLUDE_DIRS     -- Include directory for cuda headers.  Added automatically
-#                           for CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY.
-#  CUDA_LIBRARIES        -- Cuda RT library.
-#  CUDA_CUFFT_LIBRARIES  -- Device or emulation library for the Cuda FFT
-#                           implementation (alternative to:
-#                           CUDA_ADD_CUFFT_TO_TARGET macro)
-#  CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS
-#                           implementation (alterative to:
-#                           CUDA_ADD_CUBLAS_TO_TARGET macro).
-#  CUDA_cupti_LIBRARY    -- CUDA Profiling Tools Interface library.
-#                           Only available for CUDA version 4.0+.
-#  CUDA_curand_LIBRARY   -- CUDA Random Number Generation library.
-#                           Only available for CUDA version 3.2+.
-#  CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
-#                           Only available for CUDA version 3.2+.
-#  CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives library.
-#                           Only available for CUDA version 4.0+.
-#  CUDA_nppc_LIBRARY      -- NVIDIA Performance Primitives library (core).
-#                           Only available for CUDA version 5.5+.
-#  CUDA_nppi_LIBRARY      -- NVIDIA Performance Primitives library (image processing).
-#                           Only available for CUDA version 5.5+.
-#  CUDA_npps_LIBRARY      -- NVIDIA Performance Primitives library (signal processing).
-#                           Only available for CUDA version 5.5+.
-#  CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
-#                           Only available for CUDA version 3.2+.
-#                           Windows only.
-#  CUDA_nvcuvid_LIBRARY  -- CUDA Video Decoder library.
-#                           Only available for CUDA version 3.2+.
-#                           Windows only.
-#
-#
-#  James Bigler, NVIDIA Corp (nvidia.com - jbigler)
-#  Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
-#
-#  Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
-#
-#  Copyright (c) 2007-2009
-#  Scientific Computing and Imaging Institute, University of Utah
-#
-#  This code is licensed under the MIT License.  See the FindCUDA.cmake script
-#  for the text of the license.
+# platforms, or to use a cuda runtime not installed in the default
+# location.  In newer versions of the toolkit the cuda library is
+# included with the graphics driver; be sure that the driver version
+# matches what is needed by the cuda runtime version.
+#
+# The following variables affect the behavior of the macros in the
+# script (in alphabetical order).  Note that any of these flags can be
+# changed multiple times in the same directory before calling
+# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX,
+# CUDA_COMPILE_FATBIN, CUDA_COMPILE_CUBIN or CUDA_WRAP_SRCS::
+#
+#   CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
+#   -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
+#      Note that making this different from the host code when generating object
+#      or C files from CUDA code just won't work, because size_t gets defined by
+#      nvcc in the generated source.  If you compile to PTX and then load the
+#      file yourself, you can mix bit sizes between device and host.
+#
+#   CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
+#   -- Set to ON if you want the custom build rule to be attached to the source
+#      file in Visual Studio.  Turn OFF if you add the same cuda file to multiple
+#      targets.
+#
+#      This allows the user to build the target from the CUDA file; however, bad
+#      things can happen if the CUDA source file is added to multiple targets.
+#      When performing parallel builds it is possible for the custom build
+#      command to be run more than once and in parallel causing cryptic build
+#      errors.  VS runs the rules for every source file in the target, and a
+#      source can have only one rule no matter how many projects it is added to.
+#      When the rule is run from multiple targets race conditions can occur on
+#      the generated file.  Eventually everything will get built, but if the user
+#      is unaware of this behavior, there may be confusion.  It would be nice if
+#      this script could detect the reuse of source files across multiple targets
+#      and turn the option off for the user, but no good solution could be found.
+#
+#   CUDA_BUILD_CUBIN (Default OFF)
+#   -- Set to ON to enable an extra compilation pass with the -cubin option in
+#      Device mode. The output is parsed and register and shared memory usage is
+#      printed during build.
+#
+#   CUDA_BUILD_EMULATION (Default OFF for device mode)
+#   -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
+#      when CUDA_BUILD_EMULATION is TRUE.
+#
+#   CUDA_LINK_LIBRARIES_KEYWORD (Default "")
+#   -- The <PRIVATE|PUBLIC|INTERFACE> keyword to use for internal
+#      target_link_libraries calls. The default is to use no keyword which
+#      uses the old "plain" form of target_link_libraries. Note that this matters
+#      because whatever is used inside the FindCUDA module must also be used
+#      outside - the two forms of target_link_libraries cannot be mixed.
+#
+#   CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
+#   -- Set to the path you wish to have the generated files placed.  If it is
+#      blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
+#      Intermediate files will always be placed in
+#      CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
+#
+#   CUDA_HOST_COMPILATION_CPP (Default ON)
+#   -- Set to OFF for C compilation of host code.
+#
+#   CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
+#   -- Set the host compiler to be used by nvcc.  Ignored if -ccbin or
+#      --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
+#      CUDA_NVCC_FLAGS_<CONFIG> variables.  For Visual Studio targets
+#      $(VCInstallDir)/bin is a special value that expands out to the path when
+#      the command is run from within VS.
+#
+#   CUDA_NVCC_FLAGS
+#   CUDA_NVCC_FLAGS_<CONFIG>
+#   -- Additional NVCC command line arguments.  NOTE: multiple arguments must be
+#      semi-colon delimited (e.g. --compiler-options;-Wall)
+#
+#   CUDA_PROPAGATE_HOST_FLAGS (Default ON)
+#   -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
+#      dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
+#      host compiler through nvcc's -Xcompiler flag.  This helps make the
+#      generated host code match the rest of the system better.  Sometimes
+#      certain flags give nvcc problems, and this will help you turn the flag
+#      propagation off.  This does not affect the flags supplied directly to nvcc
+#      via CUDA_NVCC_FLAGS or through the OPTIONS flags specified through
+#      CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS.  Flags used for
+#      shared library compilation are not affected by this flag.
+#
+#   CUDA_SEPARABLE_COMPILATION (Default OFF)
+#   -- If set this will enable separable compilation for all CUDA runtime object
+#      files.  If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
+#      (e.g. calling CUDA_WRAP_SRCS directly),
+#      CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
+#      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
+#
+#   CUDA_SOURCE_PROPERTY_FORMAT
+#   -- If this source file property is set, it can override the format specified
+#      to CUDA_WRAP_SRCS (OBJ, PTX, CUBIN, or FATBIN).  If an input source file
+#      is not a .cu file, setting this property will cause it to be treated as a .cu
+#      file. See documentation for set_source_files_properties on how to set
+#      this property.
+#
+#   CUDA_USE_STATIC_CUDA_RUNTIME (Default ON)
+#   -- When enabled the static version of the CUDA runtime library will be used
+#      in CUDA_LIBRARIES.  If the version of CUDA configured doesn't support
+#      this option, then it will be silently disabled.
+#
+#   CUDA_VERBOSE_BUILD (Default OFF)
+#   -- Set to ON to see all the commands used when building the CUDA file.  When
+#      using a Makefile generator the value defaults to VERBOSE (run make
+#      VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
+#      always print the output.
+#
+# The script creates the following macros (in alphabetical order)::
+#
+#   CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
+#   -- Adds the cufft library to the target (can be any target).  Handles whether
+#      you are in emulation mode or not.
+#
+#   CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
+#   -- Adds the cublas library to the target (can be any target).  Handles
+#      whether you are in emulation mode or not.
+#
+#   CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
+#                        [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
+#   -- Creates an executable "cuda_target" which is made up of the files
+#      specified.  All of the non CUDA C files are compiled using the standard
+#      build rules specified by CMAKE and the cuda files are compiled to object
+#      files using nvcc and the host compiler.  In addition CUDA_INCLUDE_DIRS is
+#      added automatically to include_directories().  Some standard CMake target
+#      calls can be used on the target after calling this macro
+#      (e.g. set_target_properties and target_link_libraries), but setting
+#      properties that adjust compilation flags will not affect code compiled by
+#      nvcc.  Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
+#      CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
+#
+#   CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
+#                     [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
+#   -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
+#
+#   CUDA_BUILD_CLEAN_TARGET()
+#   -- Creates a convenience target that deletes all the dependency files
+#      generated.  You should make clean after running this target to ensure the
+#      dependency files get regenerated.
+#
+#   CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
+#                 [OPTIONS ...] )
+#   -- Returns a list of generated files from the input source files to be used
+#      with ADD_LIBRARY or ADD_EXECUTABLE.
+#
+#   CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
+#   -- Returns a list of PTX files generated from the input source files.
+#
+#   CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] )
+#   -- Returns a list of FATBIN files generated from the input source files.
+#
+#   CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] )
+#   -- Returns a list of CUBIN files generated from the input source files.
+#
+#   CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
+#                                                        cuda_target
+#                                                        object_files )
+#   -- Compute the name of the intermediate link file used for separable
+#      compilation.  This file name is typically passed into
+#      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS.  output_file_var is produced
+#      based on cuda_target and the list of object files that need separable
+#      compilation as specified by object_files.  If the object_files list is
+#      empty, then output_file_var will be empty.  This function is called
+#      automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE.  Note that
+#      this is a function and not a macro.
+#
+#   CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
+#   -- Sets the directories that should be passed to nvcc
+#      (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
+#      files.
+#
+#
+#   CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
+#                                            nvcc_flags object_files)
+#   -- Generates the link object required by separable compilation from the given
+#      object files.  This is called automatically for CUDA_ADD_EXECUTABLE and
+#      CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
+#      directly.  When called from CUDA_ADD_LIBRARY or CUDA_ADD_EXECUTABLE the
+#      nvcc_flags passed in are the same as the flags passed in via the OPTIONS
+#      argument.  The only nvcc flag added automatically is the bitness flag as
+#      specified by CUDA_64_BIT_DEVICE_CODE.  Note that this is a function
+#      instead of a macro.
+#
+#   CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable [target_CUDA_architectures])
+#   -- Selects GPU arch flags for nvcc based on target_CUDA_architectures
+#      target_CUDA_architectures : Auto | Common | All | LIST(ARCH_AND_PTX ...)
+#       - "Auto" detects local machine GPU compute arch at runtime.
+#       - "Common" and "All" cover common and entire subsets of architectures
+#      ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX
+#      NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal
+#      NUM: Any number. Only those pairs are currently accepted by NVCC though:
+#            2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2
+#      Returns LIST of flags to be added to CUDA_NVCC_FLAGS in ${out_variable}
+#      Additionally, sets ${out_variable}_readable to the resulting numeric list
+#      Example:
+#       CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.0 3.5+PTX 5.2(5.0) Maxwell)
+#        LIST(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
+#
+#      More info on CUDA architectures: https://en.wikipedia.org/wiki/CUDA
+#      Note that this is a function instead of a macro.
+#
+#   CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
+#                    [STATIC | SHARED | MODULE] [OPTIONS ...] )
+#   -- This is where all the magic happens.  CUDA_ADD_EXECUTABLE,
+#      CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
+#      function under the hood.
+#
+#      Given the list of files (file0 file1 ... fileN) this macro generates
+#      custom commands that generate either PTX or linkable objects (use "PTX" or
+#      "OBJ" for the format argument to switch).  Files that don't end with .cu
+#      or have the HEADER_FILE_ONLY property are ignored.
+#
+#      The arguments passed in after OPTIONS are extra command line options to
+#      give to nvcc.  You can also specify per configuration options by
+#      specifying the name of the configuration followed by the options.  General
+#      options must precede configuration specific options.  Not all
+#      configurations need to be specified, only the ones provided will be used.
+#
+#         OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
+#         DEBUG -g
+#         RELEASE --use_fast_math
+#         RELWITHDEBINFO --use_fast_math;-g
+#         MINSIZEREL --use_fast_math
+#
+#      For certain configurations (namely VS generating object files with
+#      CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
+#      be produced for the given cuda file.  This is because when you add the
+#      cuda file to Visual Studio it knows that this file produces an object file
+#      and will link in the resulting object file automatically.
+#
+#      This script will also generate a separate cmake script that is used at
+#      build time to invoke nvcc.  This is for several reasons.
+#
+#        1. nvcc can return negative numbers as return values which confuses
+#        Visual Studio into thinking that the command succeeded.  The script now
+#        checks the error codes and produces errors when there was a problem.
+#
+#        2. nvcc has been known to not delete incomplete results when it
+#        encounters problems.  This confuses build systems into thinking the
+#        target was generated when in fact an unusable file exists.  The script
+#        now deletes the output files if there was an error.
+#
+#        3. By putting all the options that affect the build into a file and then
+#        making the build rule dependent on the file, the output files will be
+#        regenerated when the options change.
+#
+#      This script also looks at optional arguments STATIC, SHARED, or MODULE to
+#      determine when to target the object compilation for a shared library.
+#      BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
+#      CUDA_ADD_LIBRARY.  On some systems special flags are added for building
+#      objects intended for shared libraries.  A preprocessor macro,
+#      <target_name>_EXPORTS is defined when a shared library compilation is
+#      detected.
+#
+#      Flags passed into add_definitions with -D or /D are passed along to nvcc.
+#
+#
+#
+# The script defines the following variables::
+#
+#   CUDA_VERSION_MAJOR    -- The major version of cuda as reported by nvcc.
+#   CUDA_VERSION_MINOR    -- The minor version.
+#   CUDA_VERSION
+#   CUDA_VERSION_STRING   -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
+#   CUDA_HAS_FP16         -- Whether a short float (float16,fp16) is supported.
+#
+#   CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
+#   CUDA_SDK_ROOT_DIR     -- Path to the CUDA SDK.  Use this to find files in the
+#                            SDK.  This script will not directly support finding
+#                            specific libraries or headers, as that isn't
+#                            supported by NVIDIA.  If you want to change
+#                            libraries when the path changes see the
+#                            FindCUDA.cmake script for an example of how to clear
+#                            these variables.  There are also examples of how to
+#                            use the CUDA_SDK_ROOT_DIR to locate headers or
+#                            libraries, if you so choose (at your own risk).
+#   CUDA_INCLUDE_DIRS     -- Include directory for cuda headers.  Added automatically
+#                            for CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY.
+#   CUDA_LIBRARIES        -- Cuda RT library.
+#   CUDA_CUFFT_LIBRARIES  -- Device or emulation library for the Cuda FFT
+#                            implementation (alternative to:
+#                            CUDA_ADD_CUFFT_TO_TARGET macro)
+#   CUDA_CUBLAS_LIBRARIES -- Device or emulation library for the Cuda BLAS
+#                            implementation (alternative to:
+#                            CUDA_ADD_CUBLAS_TO_TARGET macro).
+#   CUDA_cudart_static_LIBRARY -- Statically linkable cuda runtime library.
+#                                 Only available for CUDA version 5.5+
+#   CUDA_cudadevrt_LIBRARY -- Device runtime library.
+#                             Required for separable compilation.
+#   CUDA_cupti_LIBRARY    -- CUDA Profiling Tools Interface library.
+#                            Only available for CUDA version 4.0+.
+#   CUDA_curand_LIBRARY   -- CUDA Random Number Generation library.
+#                            Only available for CUDA version 3.2+.
+#   CUDA_cusolver_LIBRARY -- CUDA Direct Solver library.
+#                            Only available for CUDA version 7.0+.
+#   CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
+#                            Only available for CUDA version 3.2+.
+#   CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives lib.
+#                            Only available for CUDA version 4.0+.
+#   CUDA_nppc_LIBRARY     -- NVIDIA Performance Primitives lib (core).
+#                            Only available for CUDA version 5.5+.
+#   CUDA_nppi_LIBRARY     -- NVIDIA Performance Primitives lib (image processing).
+#                            Only available for CUDA version 5.5+.
+#   CUDA_npps_LIBRARY     -- NVIDIA Performance Primitives lib (signal processing).
+#                            Only available for CUDA version 5.5+.
+#   CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
+#                            Only available for CUDA version 3.2+.
+#                            Windows only.
+#   CUDA_nvcuvid_LIBRARY  -- CUDA Video Decoder library.
+#                            Only available for CUDA version 3.2+.
+#                            Windows only.
+#
+
+#   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+#
+#   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#   Copyright (c) 2007-2009
+#   Scientific Computing and Imaging Institute, University of Utah
+#
+#   This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#   for the text of the license.
 
 # The MIT License
 #
@@ -313,11 +372,6 @@
 
 # FindCUDA.cmake
 
-# We need to have at least this version to support the VERSION_LESS argument to 'if' (2.6.2) and unset (2.6.3)
-cmake_policy(PUSH)
-cmake_minimum_required(VERSION 2.6.3)
-cmake_policy(POP)
-
 # This macro helps us find the location of helper files we will need the full path to
 macro(CUDA_FIND_HELPER_FILE _name _extension)
   set(_full_name "${_name}.${_extension}")
@@ -440,7 +494,31 @@ set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
 if(CMAKE_GENERATOR MATCHES "Visual Studio")
   set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC")
 else()
-  set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC")
+  if(APPLE
+      AND "${CMAKE_C_COMPILER_ID}" MATCHES "Clang"
+      AND "${CMAKE_C_COMPILER}" MATCHES "/cc$")
+    # When cc is a symlink to clang, NVCC may mistake it for GCC and pass the
+    # unhandled -dumpspecs option to clang, so the symlink is resolved to the
+    # real clang binary.  If CMAKE_C_COMPILER is not defined (the project does
+    # not use the C language) or CUDA_HOST_COMPILER is already defined, the
+    # default is left empty so that -ccbin is skipped and nvcc uses its own
+    # default C compiler.  This is limited to APPLE with clang to avoid
+    # following symlinks to things like ccache.
+    if(DEFINED CMAKE_C_COMPILER AND NOT DEFINED CUDA_HOST_COMPILER)
+      get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
+      # If the resolved path does not end in /clang, fall back to the
+      # unresolved CMAKE_C_COMPILER.
+      if(NOT "${c_compiler_realpath}" MATCHES "/clang$")
+        set(c_compiler_realpath "${CMAKE_C_COMPILER}")
+      endif()
+    else()
+      set(c_compiler_realpath "")
+    endif()
+    set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC")
+  else()
+    set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}"
+      CACHE FILEPATH "Host side compiler used by NVCC")
+  endif()
 endif()
 
 # Propagate the host flags to the host compiler via -Xcompiler
@@ -459,6 +537,10 @@ mark_as_advanced(
   CUDA_HOST_COMPILATION_CPP
   CUDA_NVCC_FLAGS
   CUDA_PROPAGATE_HOST_FLAGS
+  CUDA_BUILD_CUBIN
+  CUDA_BUILD_EMULATION
+  CUDA_VERBOSE_BUILD
+  CUDA_SEPARABLE_COMPILATION
   )
 
 # Makefile and similar generators don't define CMAKE_CONFIGURATION_TYPES, so we
@@ -481,25 +563,25 @@ endforeach()
 ###############################################################################
 ###############################################################################
 
-# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed,
-# if they have then clear the cache variables, so that will be detected again.
-if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
-  unset(CUDA_NVCC_EXECUTABLE CACHE)
+macro(cuda_unset_include_and_libraries)
   unset(CUDA_TOOLKIT_INCLUDE CACHE)
   unset(CUDA_CUDART_LIBRARY CACHE)
+  unset(CUDA_CUDA_LIBRARY CACHE)
   # Make sure you run this before you unset CUDA_VERSION.
   if(CUDA_VERSION VERSION_EQUAL "3.0")
     # This only existed in the 3.0 version of the CUDA toolkit
     unset(CUDA_CUDARTEMU_LIBRARY CACHE)
   endif()
-  unset(CUDA_VERSION CACHE)
-  unset(CUDA_CUDA_LIBRARY CACHE)
-  unset(CUDA_cupti_LIBRARY CACHE)
+  unset(CUDA_cudart_static_LIBRARY CACHE)
+  unset(CUDA_cudadevrt_LIBRARY CACHE)
   unset(CUDA_cublas_LIBRARY CACHE)
+  unset(CUDA_cublas_device_LIBRARY CACHE)
   unset(CUDA_cublasemu_LIBRARY CACHE)
   unset(CUDA_cufft_LIBRARY CACHE)
   unset(CUDA_cufftemu_LIBRARY CACHE)
+  unset(CUDA_cupti_LIBRARY CACHE)
   unset(CUDA_curand_LIBRARY CACHE)
+  unset(CUDA_cusolver_LIBRARY CACHE)
   unset(CUDA_cusparse_LIBRARY CACHE)
   unset(CUDA_npp_LIBRARY CACHE)
   unset(CUDA_nppc_LIBRARY CACHE)
@@ -507,33 +589,50 @@ if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
   unset(CUDA_npps_LIBRARY CACHE)
   unset(CUDA_nvcuvenc_LIBRARY CACHE)
   unset(CUDA_nvcuvid_LIBRARY CACHE)
-endif()
+  unset(CUDA_USE_STATIC_CUDA_RUNTIME CACHE)
+  unset(CUDA_GPU_DETECT_OUTPUT CACHE)
+endmacro()
 
-if(NOT "${CUDA_SDK_ROOT_DIR}" STREQUAL "${CUDA_SDK_ROOT_DIR_INTERNAL}")
-  # No specific variables to catch.  Use this kind of code before calling
-  # find_package(CUDA) to clean up any variables that may depend on this path.
+# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed,
+# if they have then clear the cache variables, so that will be detected again.
+if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
+  unset(CUDA_TOOLKIT_TARGET_DIR CACHE)
+  unset(CUDA_NVCC_EXECUTABLE CACHE)
+  cuda_unset_include_and_libraries()
+  unset(CUDA_VERSION CACHE)
+endif()
 
-  #   unset(MY_SPECIAL_CUDA_SDK_INCLUDE_DIR CACHE)
-  #   unset(MY_SPECIAL_CUDA_SDK_LIBRARY CACHE)
+if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}")
+  cuda_unset_include_and_libraries()
 endif()
 
-# Search for the cuda distribution.
-if(NOT CUDA_TOOLKIT_ROOT_DIR)
+#
+#  End of unset()
+#
+
+#
+#  Start looking for things
+#
 
+# Search for the cuda distribution.
+if(NOT CUDA_TOOLKIT_ROOT_DIR AND NOT CMAKE_CROSSCOMPILING)
   # Search in the CUDA_BIN_PATH first.
   find_path(CUDA_TOOLKIT_ROOT_DIR
     NAMES nvcc nvcc.exe
     PATHS
+      ENV CUDA_TOOLKIT_ROOT
       ENV CUDA_PATH
       ENV CUDA_BIN_PATH
     PATH_SUFFIXES bin bin64
     DOC "Toolkit location."
     NO_DEFAULT_PATH
     )
+
   # Now search default paths
   find_path(CUDA_TOOLKIT_ROOT_DIR
     NAMES nvcc nvcc.exe
-    PATHS /usr/local/bin
+    PATHS /opt/cuda/bin
+          /usr/local/bin
           /usr/local/cuda/bin
     DOC "Toolkit location."
     )
@@ -542,7 +641,9 @@ if(NOT CUDA_TOOLKIT_ROOT_DIR)
     string(REGEX REPLACE "[/\\\\]?bin[64]*[/\\\\]?$" "" CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR})
     # We need to force this back into the cache.
     set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT_DIR} CACHE PATH "Toolkit location." FORCE)
+    set(CUDA_TOOLKIT_TARGET_DIR ${CUDA_TOOLKIT_ROOT_DIR})
   endif()
+
   if (NOT EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
     if(CUDA_FIND_REQUIRED)
       message(FATAL_ERROR "Specify CUDA_TOOLKIT_ROOT_DIR")
@@ -552,8 +653,45 @@ if(NOT CUDA_TOOLKIT_ROOT_DIR)
   endif ()
 endif ()
 
+if(CMAKE_CROSSCOMPILING)
+  set(CUDA_TOOLKIT_ROOT $ENV{CUDA_TOOLKIT_ROOT})
+  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a")
+    # Support for NVPACK
+    set(CUDA_TOOLKIT_TARGET_NAME "armv7-linux-androideabi")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+    # Support for arm cross compilation
+    set(CUDA_TOOLKIT_TARGET_NAME "armv7-linux-gnueabihf")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    # Support for aarch64 cross compilation (Android or plain Linux target)
+    if(ANDROID_ARCH_NAME STREQUAL "arm64")
+      set(CUDA_TOOLKIT_TARGET_NAME "aarch64-linux-androideabi")
+    else()
+      set(CUDA_TOOLKIT_TARGET_NAME "aarch64-linux")
+    endif()
+  endif()
+
+  if (EXISTS "${CUDA_TOOLKIT_ROOT}/targets/${CUDA_TOOLKIT_TARGET_NAME}")
+    set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT}/targets/${CUDA_TOOLKIT_TARGET_NAME}" CACHE PATH "CUDA Toolkit target location.")
+    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
+    mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR)
+  endif()
+
+  # Add the known CUDA target root path to the directories searched for programs, libraries and headers.
+  set( CMAKE_FIND_ROOT_PATH "${CUDA_TOOLKIT_TARGET_DIR};${CMAKE_FIND_ROOT_PATH}")
+  macro( cuda_find_host_program )
+    find_host_program( ${ARGN} )
+  endmacro()
+else()
+  # For non-cross-compile builds, cuda_find_host_program == find_program and CUDA_TOOLKIT_TARGET_DIR == CUDA_TOOLKIT_ROOT_DIR
+  macro( cuda_find_host_program )
+    find_program( ${ARGN} )
+  endmacro()
+  set(CUDA_TOOLKIT_TARGET_DIR ${CUDA_TOOLKIT_ROOT_DIR})
+endif()
+
+
 # CUDA_NVCC_EXECUTABLE
-find_program(CUDA_NVCC_EXECUTABLE
+cuda_find_host_program(CUDA_NVCC_EXECUTABLE
   NAMES nvcc
   PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
   ENV CUDA_PATH
@@ -562,7 +700,7 @@ find_program(CUDA_NVCC_EXECUTABLE
   NO_DEFAULT_PATH
   )
 # Search default search paths, after we search our own set of paths.
-find_program(CUDA_NVCC_EXECUTABLE nvcc)
+cuda_find_host_program(CUDA_NVCC_EXECUTABLE nvcc)
 mark_as_advanced(CUDA_NVCC_EXECUTABLE)
 
 if(CUDA_NVCC_EXECUTABLE AND NOT CUDA_VERSION)
@@ -578,13 +716,14 @@ else()
   string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${CUDA_VERSION}")
 endif()
 
+
 # Always set this convenience variable
 set(CUDA_VERSION_STRING "${CUDA_VERSION}")
 
 # CUDA_TOOLKIT_INCLUDE
 find_path(CUDA_TOOLKIT_INCLUDE
   device_functions.h # Header included in toolkit
-  PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
+  PATHS ${CUDA_TOOLKIT_TARGET_DIR}
   ENV CUDA_PATH
   ENV CUDA_INC_PATH
   PATH_SUFFIXES include
@@ -594,8 +733,14 @@ find_path(CUDA_TOOLKIT_INCLUDE
 find_path(CUDA_TOOLKIT_INCLUDE device_functions.h)
 mark_as_advanced(CUDA_TOOLKIT_INCLUDE)
 
+# fp16 is assumed for toolkits newer than 7.0 or whenever cuda_fp16.h is present.
+set(CUDA_HAS_FP16 FALSE)
+if (CUDA_VERSION VERSION_GREATER "7.0" OR EXISTS "${CUDA_TOOLKIT_INCLUDE}/cuda_fp16.h")
+  set(CUDA_HAS_FP16 TRUE)
+endif()
+
 # Set the user list of include dir to nothing to initialize it.
-set (CUDA_NVCC_INCLUDE_ARGS_USER "")
+set (CUDA_NVCC_INCLUDE_DIRS_USER "")
 set (CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
 
 macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
@@ -608,19 +753,21 @@ macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
   # (lib/Win32) and the old path (lib).
   find_library(${_var}
     NAMES ${_names}
-    PATHS "${CUDA_TOOLKIT_ROOT_DIR}"
+    PATHS "${CUDA_TOOLKIT_TARGET_DIR}"
     ENV CUDA_PATH
     ENV CUDA_LIB_PATH
     PATH_SUFFIXES ${_cuda_64bit_lib_dir} "${_path_ext}lib/Win32" "${_path_ext}lib" "${_path_ext}libWin32"
     DOC ${_doc}
     NO_DEFAULT_PATH
     )
-  # Search default search paths, after we search our own set of paths.
-  find_library(${_var}
-    NAMES ${_names}
-    PATHS "/usr/lib/nvidia-current"
-    DOC ${_doc}
-    )
+  if (NOT CMAKE_CROSSCOMPILING)
+    # Search default search paths, after we search our own set of paths.
+    find_library(${_var}
+      NAMES ${_names}
+      PATHS "/usr/lib/nvidia-current"
+      DOC ${_doc}
+      )
+  endif()
 endmacro()
 
 macro(cuda_find_library_local_first _var _names _doc)
@@ -642,30 +789,89 @@ if(CUDA_VERSION VERSION_EQUAL "3.0")
     )
 endif()
 
+if(NOT CUDA_VERSION VERSION_LESS "5.5")
+  cuda_find_library_local_first(CUDA_cudart_static_LIBRARY cudart_static "static CUDA runtime library")
+  mark_as_advanced(CUDA_cudart_static_LIBRARY)
+endif()
+
+# CUDA_CUDART_LIBRARY_VAR names the cudart variable that find_package_handle_standard_args requires.
+if(CUDA_cudart_static_LIBRARY)
+  # If the static cudart is available, use it by default, but provide a user-visible option to disable it.
+  option(CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library if available" ON)
+  set(CUDA_CUDART_LIBRARY_VAR CUDA_cudart_static_LIBRARY)
+else()
+  # If it is not available, silently force the option off and fall back to the dynamic cudart variable.
+  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "")
+  set(CUDA_CUDART_LIBRARY_VAR CUDA_CUDART_LIBRARY)
+endif()
+if(NOT CUDA_VERSION VERSION_LESS "5.0")
+  cuda_find_library_local_first(CUDA_cudadevrt_LIBRARY cudadevrt "\"cudadevrt\" library")
+  mark_as_advanced(CUDA_cudadevrt_LIBRARY)
+endif()
+
+if(CUDA_USE_STATIC_CUDA_RUNTIME)
+  if(UNIX)
+    # Check for the dependent libraries.  Here we look for pthreads, saving any existing CMAKE_THREAD_PREFER_PTHREAD value.
+    if (DEFINED CMAKE_THREAD_PREFER_PTHREAD)
+      set(_cuda_cmake_thread_prefer_pthread ${CMAKE_THREAD_PREFER_PTHREAD})
+    endif()
+    set(CMAKE_THREAD_PREFER_PTHREAD 1)
+
+    # Many of the Find modules that ship with CMake use try_compile with int main(){return 0;}
+    # as the source file.  Unfortunately this causes a warning with -Wstrict-prototypes, and
+    # -Werror then causes the try_compile to fail.  So the C flags are temporarily replaced
+    # for the find_package command here and restored right after it.
+    set(_cuda_cmake_c_flags ${CMAKE_C_FLAGS})
+    set(CMAKE_C_FLAGS "-fPIC")
+    find_package(Threads REQUIRED)
+    set(CMAKE_C_FLAGS ${_cuda_cmake_c_flags})
+
+    if (DEFINED _cuda_cmake_thread_prefer_pthread)
+      set(CMAKE_THREAD_PREFER_PTHREAD ${_cuda_cmake_thread_prefer_pthread})
+      unset(_cuda_cmake_thread_prefer_pthread)
+    else()
+      unset(CMAKE_THREAD_PREFER_PTHREAD)
+    endif()
+
+    if(NOT APPLE)
+      # On non-Apple UNIX (e.g. Linux), the static cuda runtime must be linked against librt.
+      find_library(CUDA_rt_LIBRARY rt)
+      if (NOT CUDA_rt_LIBRARY)
+        message(WARNING "Expecting to find librt for libcudart_static, but didn't find it.")
+      endif()
+    endif()
+  endif()
+endif()
+
 # CUPTI library showed up in cuda toolkit 4.0
 if(NOT CUDA_VERSION VERSION_LESS "4.0")
   cuda_find_library_local_first_with_path_ext(CUDA_cupti_LIBRARY cupti "\"cupti\" library" "extras/CUPTI/")
   mark_as_advanced(CUDA_cupti_LIBRARY)
 endif()
 
+# Set the CUDA_LIBRARIES variable.  This is the set of stuff to link against if you are
+# using the CUDA runtime.  For the dynamic version of the runtime, most of the
+# dependencies are brought in, but for the static version there are additional libraries
+# and linker commands needed.
+# Initialize to empty
+set(CUDA_LIBRARIES)
+
 # If we are using emulation mode and we found the cudartemu library then use
 # that one instead of cudart.
 if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
-  set(CUDA_LIBRARIES ${CUDA_CUDARTEMU_LIBRARY})
-else()
-  set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
-endif()
-if(APPLE)
-  # We need to add the path to cudart to the linker using rpath, since the
-  # library name for the cuda libraries is prepended with @rpath.
-  if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
-    get_filename_component(_cuda_path_to_cudart "${CUDA_CUDARTEMU_LIBRARY}" PATH)
-  else()
-    get_filename_component(_cuda_path_to_cudart "${CUDA_CUDART_LIBRARY}" PATH)
+  list(APPEND CUDA_LIBRARIES ${CUDA_CUDARTEMU_LIBRARY})
+elseif(CUDA_USE_STATIC_CUDA_RUNTIME AND CUDA_cudart_static_LIBRARY)
+  list(APPEND CUDA_LIBRARIES ${CUDA_cudart_static_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
+  if (CUDA_rt_LIBRARY)
+    list(APPEND CUDA_LIBRARIES ${CUDA_rt_LIBRARY})
   endif()
-  if(_cuda_path_to_cudart)
-    list(APPEND CUDA_LIBRARIES -Wl,-rpath "-Wl,${_cuda_path_to_cudart}")
+  if(APPLE)
+    # We need to add the default path to the driver (libcuda.dylib) as an rpath, so that
+    # the static cuda runtime can find it at runtime.
+    list(APPEND CUDA_LIBRARIES -Wl,-rpath,/usr/local/cuda/lib)
   endif()
+else()
+  list(APPEND CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
 endif()
 
 # 1.1 toolkit on linux doesn't appear to have a separate library on
@@ -710,6 +916,7 @@ if(NOT CUDA_VERSION VERSION_LESS "3.2")
   endif()
 endif()
 if(CUDA_VERSION VERSION_GREATER "5.0")
+  find_cuda_helper_libs(cublas_device)
   # In CUDA 5.5 NPP was splitted onto 3 separate libraries.
   find_cuda_helper_libs(nppc)
   find_cuda_helper_libs(nppi)
@@ -718,13 +925,17 @@ if(CUDA_VERSION VERSION_GREATER "5.0")
 elseif(NOT CUDA_VERSION VERSION_LESS "4.0")
   find_cuda_helper_libs(npp)
 endif()
+if(NOT CUDA_VERSION VERSION_LESS "7.0")
+  # cusolver showed up in version 7.0
+  find_cuda_helper_libs(cusolver)
+endif()
 
 if (CUDA_BUILD_EMULATION)
   set(CUDA_CUFFT_LIBRARIES ${CUDA_cufftemu_LIBRARY})
   set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY})
 else()
   set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY})
+  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
 endif()
 
 ########################
@@ -799,16 +1010,19 @@ set(CUDA_FOUND TRUE)
 
 set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL
   "This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE)
+set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL
+  "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE)
 set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL
   "This is the value of the last time CUDA_SDK_ROOT_DIR was set successfully." FORCE)
 
 include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake)
+
 find_package_handle_standard_args(CUDA
   REQUIRED_VARS
     CUDA_TOOLKIT_ROOT_DIR
     CUDA_NVCC_EXECUTABLE
     CUDA_INCLUDE_DIRS
-    CUDA_CUDART_LIBRARY
+    ${CUDA_CUDART_LIBRARY_VAR}
   VERSION_VAR
     CUDA_VERSION
   )
@@ -825,7 +1039,7 @@ find_package_handle_standard_args(CUDA
 # Add include directories to pass to the nvcc command.
 macro(CUDA_INCLUDE_DIRECTORIES)
   foreach(dir ${ARGN})
-    list(APPEND CUDA_NVCC_INCLUDE_ARGS_USER -I${dir})
+    list(APPEND CUDA_NVCC_INCLUDE_DIRS_USER ${dir})
   endforeach()
 endmacro()
 
@@ -834,6 +1048,7 @@ endmacro()
 cuda_find_helper_file(parse_cubin cmake)
 cuda_find_helper_file(make2cmake cmake)
 cuda_find_helper_file(run_nvcc cmake)
+include("${CMAKE_CURRENT_LIST_DIR}/FindCUDA/select_compute_arch.cmake")
 
 ##############################################################################
 # Separate the OPTIONS out from the sources
@@ -844,15 +1059,15 @@ macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options)
   set( ${_options} )
   set( _found_options FALSE )
   foreach(arg ${ARGN})
-    if(arg STREQUAL "OPTIONS")
+    if("x${arg}" STREQUAL "xOPTIONS")
       set( _found_options TRUE )
     elseif(
-        arg STREQUAL "WIN32" OR
-        arg STREQUAL "MACOSX_BUNDLE" OR
-        arg STREQUAL "EXCLUDE_FROM_ALL" OR
-        arg STREQUAL "STATIC" OR
-        arg STREQUAL "SHARED" OR
-        arg STREQUAL "MODULE"
+        "x${arg}" STREQUAL "xWIN32" OR
+        "x${arg}" STREQUAL "xMACOSX_BUNDLE" OR
+        "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
+        "x${arg}" STREQUAL "xSTATIC" OR
+        "x${arg}" STREQUAL "xSHARED" OR
+        "x${arg}" STREQUAL "xMODULE"
         )
       list(APPEND ${_cmake_options} ${arg})
     else()
@@ -948,7 +1163,7 @@ function(CUDA_COMPUTE_BUILD_PATH path build_path)
     endif()
   endif()
 
-  # This recipie is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
+  # This recipe is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
   # CMake source.
 
   # Remove leading /
@@ -977,7 +1192,7 @@ endfunction()
 # a .cpp or .ptx file.
 # INPUT:
 #   cuda_target         - Target name
-#   format              - PTX or OBJ
+#   format              - PTX, CUBIN, FATBIN or OBJ
 #   FILE1 .. FILEN      - The remaining arguments are the sources to be wrapped.
 #   OPTIONS             - Extra options to NVCC
 # OUTPUT:
@@ -987,6 +1202,18 @@ endfunction()
 
 macro(CUDA_WRAP_SRCS cuda_target format generated_files)
 
+  # ARGN is not a real variable inside a macro, so copy the optional
+  # arguments into a list variable that list() can operate on.
+  set(_argn_list "${ARGN}")
+  # Record whether "PHONY" was passed among the optional arguments and, if so,
+  # strip it so it is not later treated as a source or option.
+  set(_target_is_phony false)
+  list(FIND _argn_list "PHONY" _phony_idx)
+  if(NOT "${_phony_idx}" LESS "0")
+    set(_target_is_phony true)
+    list(REMOVE_AT _argn_list ${_phony_idx})
+  endif()
+
   # If CMake doesn't support separable compilation, complain
   if(CUDA_SEPARABLE_COMPILATION AND CMAKE_VERSION VERSION_LESS "2.8.10.1")
     message(SEND_ERROR "CUDA_SEPARABLE_COMPILATION isn't supported for CMake versions less than 2.8.10.1")
@@ -1023,6 +1250,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
     set(nvcc_flags ${nvcc_flags} -m32)
   endif()
 
+  if(CUDA_TARGET_CPU_ARCH)
+    set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
+  endif()
+
   # This needs to be passed in at this stage, because VS needs to fill out the
   # value of VCInstallDir from within VS.  Note that CCBIN is only used if
   # -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches
@@ -1044,18 +1275,27 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
   endif()
 
   # Initialize our list of includes with the user ones followed by the CUDA system ones.
-  set(CUDA_NVCC_INCLUDE_ARGS ${CUDA_NVCC_INCLUDE_ARGS_USER} "-I${CUDA_INCLUDE_DIRS}")
-  # Get the include directories for this directory and use them for our nvcc command.
-  # Remove duplicate entries which may be present since include_directories
-  # in CMake >= 2.8.8 does not remove them.
-  get_directory_property(CUDA_NVCC_INCLUDE_DIRECTORIES INCLUDE_DIRECTORIES)
-  list(REMOVE_DUPLICATES CUDA_NVCC_INCLUDE_DIRECTORIES)
-  if(CUDA_NVCC_INCLUDE_DIRECTORIES)
-    foreach(dir ${CUDA_NVCC_INCLUDE_DIRECTORIES})
-      list(APPEND CUDA_NVCC_INCLUDE_ARGS -I${dir})
-    endforeach()
+  set(CUDA_NVCC_INCLUDE_DIRS ${CUDA_NVCC_INCLUDE_DIRS_USER} "${CUDA_INCLUDE_DIRS}")
+  if(_target_is_phony)
+    # If the passed in target name isn't a real target (i.e., this is from a call to one of the
+    # cuda_compile_* functions), need to query directory properties to get include directories
+    # and compile definitions.
+    get_directory_property(_dir_include_dirs INCLUDE_DIRECTORIES)
+    get_directory_property(_dir_compile_defs COMPILE_DEFINITIONS)
+
+    list(APPEND CUDA_NVCC_INCLUDE_DIRS "${_dir_include_dirs}")
+    set(CUDA_NVCC_COMPILE_DEFINITIONS "${_dir_compile_defs}")
+  else()
+    # Append the include directories for this target via generator expression, which is
+    # expanded by the FILE(GENERATE) call below.  This generator expression captures all
+    # include dirs set by the user, whether via directory properties or target properties
+    list(APPEND CUDA_NVCC_INCLUDE_DIRS "$<TARGET_PROPERTY:${cuda_target},INCLUDE_DIRECTORIES>")
+
+    # Do the same thing with compile definitions
+    set(CUDA_NVCC_COMPILE_DEFINITIONS "$<TARGET_PROPERTY:${cuda_target},COMPILE_DEFINITIONS>")
   endif()
 
+
   # Reset these variables
   set(CUDA_WRAP_OPTION_NVCC_FLAGS)
   foreach(config ${CUDA_configuration_types})
@@ -1063,7 +1303,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
     set(CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper})
   endforeach()
 
-  CUDA_GET_SOURCES_AND_OPTIONS(_cuda_wrap_sources _cuda_wrap_cmake_options _cuda_wrap_options ${ARGN})
+  CUDA_GET_SOURCES_AND_OPTIONS(_cuda_wrap_sources _cuda_wrap_cmake_options _cuda_wrap_options ${_argn_list})
   CUDA_PARSE_NVCC_OPTIONS(CUDA_WRAP_OPTION_NVCC_FLAGS ${_cuda_wrap_options})
 
   # Figure out if we are building a shared library.  BUILD_SHARED_LIBS is
@@ -1122,21 +1362,26 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
         set(_cuda_C_FLAGS "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
       endif()
 
-      set(_cuda_host_flags "${_cuda_host_flags}\nset(CMAKE_HOST_FLAGS_${config_upper} ${_cuda_C_FLAGS})")
+      string(APPEND _cuda_host_flags "\nset(CMAKE_HOST_FLAGS_${config_upper} ${_cuda_C_FLAGS})")
     endif()
 
     # Note that if we ever want CUDA_NVCC_FLAGS_<CONFIG> to be string (instead of a list
     # like it is currently), we can remove the quotes around the
     # ${CUDA_NVCC_FLAGS_${config_upper}} variable like the CMAKE_HOST_FLAGS_<CONFIG> variable.
-    set(_cuda_nvcc_flags_config "${_cuda_nvcc_flags_config}\nset(CUDA_NVCC_FLAGS_${config_upper} ${CUDA_NVCC_FLAGS_${config_upper}} ;; ${CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}})")
+    string(APPEND _cuda_nvcc_flags_config "\nset(CUDA_NVCC_FLAGS_${config_upper} ${CUDA_NVCC_FLAGS_${config_upper}} ;; ${CUDA_WRAP_OPTION_NVCC_FLAGS_${config_upper}})")
   endforeach()
 
-  # Get the list of definitions from the directory property
-  get_directory_property(CUDA_NVCC_DEFINITIONS COMPILE_DEFINITIONS)
-  if(CUDA_NVCC_DEFINITIONS)
-    foreach(_definition ${CUDA_NVCC_DEFINITIONS})
-      list(APPEND nvcc_flags "-D${_definition}")
-    endforeach()
+  # Process the C++11 flag.  If the host sets the flag, we need to add it to nvcc and
+  # remove it from the host. This is because -Xcompiler -std=c++11 will choke nvcc (it uses
+  # the C preprocessor).  In order to get this to work correctly, we need to use nvcc's
+  # specific c++11 flag.
+  if( "${_cuda_host_flags}" MATCHES "-std=c\\+\\+11")
+    # Add the c++11 flag to nvcc if it isn't already present in either the "-std=c++11" or
+    # the list ("-std;c++11") spelling.  Only the main flag is checked, not the per-config flags.
+    if( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std[=;]c\\+\\+11" )
+      list(APPEND nvcc_flags --std c++11)
+    endif()
+    string(REGEX REPLACE "[-]+std=c\\+\\+11" "" _cuda_host_flags "${_cuda_host_flags}")
   endif()
 
   if(_cuda_build_shared_libs)
@@ -1148,27 +1393,39 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
 
   # Iterate over the macro arguments and create custom
   # commands for all the .cu files.
-  foreach(file ${ARGN})
+  foreach(file ${_argn_list})
     # Ignore any file marked as a HEADER_FILE_ONLY
     get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
-    if(${file} MATCHES ".*\\.cu$" AND NOT _is_header)
+    # Allow per source file overrides of the format.  Also allows compiling non-.cu files.
+    get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
+    if((${file} MATCHES "\\.cu$" OR _cuda_source_format) AND NOT _is_header)
 
-      # Allow per source file overrides of the format.
-      get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
       if(NOT _cuda_source_format)
         set(_cuda_source_format ${format})
       endif()
-
-      if( ${_cuda_source_format} MATCHES "PTX" )
-        set( compile_to_ptx ON )
-      elseif( ${_cuda_source_format} MATCHES "OBJ")
-        set( compile_to_ptx OFF )
+      # If file isn't a .cu file, we need to tell nvcc to treat it as such.
+      if(NOT ${file} MATCHES "\\.cu$")
+        set(cuda_language_flag -x=cu)
       else()
-        message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'.  Use OBJ or PTX.")
+        set(cuda_language_flag)
       endif()
 
+      # OBJ sources are compiled to object files; every other accepted format
+      # (PTX, CUBIN, FATBIN) is emitted as an external module of that type.
+      set( cuda_compile_to_external_module ON )
+      if( ${_cuda_source_format} MATCHES "OBJ")
+        set( cuda_compile_to_external_module OFF )
+      elseif( ${_cuda_source_format} MATCHES "PTX" )
+        set( cuda_compile_to_external_module_type "ptx" )
+      elseif( ${_cuda_source_format} MATCHES "CUBIN")
+        set( cuda_compile_to_external_module_type "cubin" )
+      elseif( ${_cuda_source_format} MATCHES "FATBIN")
+        set( cuda_compile_to_external_module_type "fatbin" )
+      else()
+        message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS or set with CUDA_SOURCE_PROPERTY_FORMAT file property for file '${file}': '${_cuda_source_format}'.  Use OBJ, PTX, CUBIN or FATBIN.")
+      endif()
 
-      if(compile_to_ptx)
+      if(cuda_compile_to_external_module)
         # Don't use any of the host compilation flags for PTX targets.
         set(CUDA_HOST_FLAGS)
         set(CUDA_NVCC_FLAGS_CONFIG)
@@ -1183,7 +1440,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       if(CUDA_GENERATED_OUTPUT_DIR)
         set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}")
       else()
-        if ( compile_to_ptx )
+        if ( cuda_compile_to_external_module )
           set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
         else()
           set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}")
@@ -1193,10 +1450,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       # Add a custom target to generate a c or ptx file. ######################
 
       get_filename_component( basename ${file} NAME )
-      if( compile_to_ptx )
+      if( cuda_compile_to_external_module )
         set(generated_file_path "${cuda_compile_output_dir}")
-        set(generated_file_basename "${cuda_target}_generated_${basename}.ptx")
-        set(format_flag "-ptx")
+        set(generated_file_basename "${cuda_target}_generated_${basename}.${cuda_compile_to_external_module_type}")
+        set(format_flag "-${cuda_compile_to_external_module_type}")
         file(MAKE_DIRECTORY "${cuda_compile_output_dir}")
       else()
         set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
@@ -1216,10 +1473,11 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       set(cmake_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.depend")
       set(NVCC_generated_dependency_file "${cuda_compile_intermediate_directory}/${generated_file_basename}.NVCC-depend")
       set(generated_cubin_file "${generated_file_path}/${generated_file_basename}.cubin.txt")
-      set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake")
+      set(custom_target_script_pregen "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake.pre-gen")
+      set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}$<$<BOOL:$<CONFIG>>:.$<CONFIG>>.cmake")
 
       # Setup properties for obj files:
-      if( NOT compile_to_ptx )
+      if( NOT cuda_compile_to_external_module )
         set_source_files_properties("${generated_file}"
           PROPERTIES
           EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked.
@@ -1234,7 +1492,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
         set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
       endif()
 
-      if( NOT compile_to_ptx AND CUDA_SEPARABLE_COMPILATION)
+      if( NOT cuda_compile_to_external_module AND CUDA_SEPARABLE_COMPILATION)
         list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
       endif()
 
@@ -1251,13 +1509,17 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       # Build the NVCC made dependency file ###################################
       set(build_cubin OFF)
       if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN )
-         if ( NOT compile_to_ptx )
+         if ( NOT cuda_compile_to_external_module )
            set ( build_cubin ON )
          endif()
       endif()
 
       # Configure the build script
-      configure_file("${CUDA_run_nvcc}" "${custom_target_script}" @ONLY)
+      configure_file("${CUDA_run_nvcc}" "${custom_target_script_pregen}" @ONLY)
+      file(GENERATE
+        OUTPUT "${custom_target_script}"
+        INPUT "${custom_target_script_pregen}"
+        )
 
       # So if a user specifies the same cuda file as input more than once, you
       # can have bad things happen with dependencies.  Here we check an option
@@ -1278,12 +1540,17 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
 
       # Create up the comment string
       file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
-      if(compile_to_ptx)
-        set(cuda_build_comment_string "Building NVCC ptx file ${generated_file_relative_path}")
+      if(cuda_compile_to_external_module)
+        set(cuda_build_comment_string "Building NVCC ${cuda_compile_to_external_module_type} file ${generated_file_relative_path}")
       else()
         set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}")
       endif()
 
+      set(_verbatim VERBATIM)
+      if(ccbin_flags MATCHES "\\$\\(VCInstallDir\\)")
+        set(_verbatim "")
+      endif()
+
       # Build the generated file and dependency file ##########################
       add_custom_command(
         OUTPUT ${generated_file}
@@ -1302,6 +1569,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
           -P "${custom_target_script}"
         WORKING_DIRECTORY "${cuda_compile_intermediate_directory}"
         COMMENT "${cuda_build_comment_string}"
+        ${_verbatim}
         )
 
       # Make sure the build system knows the file is generated.
@@ -1323,10 +1591,10 @@ endmacro()
 
 function(_cuda_get_important_host_flags important_flags flag_string)
   if(CMAKE_GENERATOR MATCHES "Visual Studio")
-    string(REGEX MATCHALL "/M[DT][d]?" flags ${flag_string})
+    string(REGEX MATCHALL "/M[DT][d]?" flags "${flag_string}")
     list(APPEND ${important_flags} ${flags})
   else()
-    string(REGEX MATCHALL "-fPIC" flags ${flag_string})
+    string(REGEX MATCHALL "-fPIC" flags "${flag_string}")
     list(APPEND ${important_flags} ${flags})
   endif()
   set(${important_flags} ${${important_flags}} PARENT_SCOPE)
@@ -1372,18 +1640,40 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options
     # If -ccbin, --compiler-bindir has been specified, don't do anything.  Otherwise add it here.
     list( FIND nvcc_flags "-ccbin" ccbin_found0 )
     list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
-    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
-      list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
+    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
+      # Match VERBATIM check below.
+      if(CUDA_HOST_COMPILER MATCHES "\\$\\(VCInstallDir\\)")
+        list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
+      else()
+        list(APPEND nvcc_flags -ccbin "${CUDA_HOST_COMPILER}")
+      endif()
     endif()
+
+    # Create a list of flags specified by CUDA_NVCC_FLAGS_${CONFIG} and CMAKE_${CUDA_C_OR_CXX}_FLAGS*
+    set(config_specific_flags)
     set(flags)
     foreach(config ${CUDA_configuration_types})
       string(TOUPPER ${config} config_upper)
+      # Add config specific flags
+      foreach(f ${CUDA_NVCC_FLAGS_${config_upper}})
+        list(APPEND config_specific_flags $<$<CONFIG:${config}>:${f}>)
+      endforeach()
       set(important_host_flags)
-      _cuda_get_important_host_flags(important_host_flags ${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}})
+      _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}}")
       foreach(f ${important_host_flags})
         list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
       endforeach()
     endforeach()
+    # Add CMAKE_${CUDA_C_OR_CXX}_FLAGS
+    set(important_host_flags)
+    _cuda_get_important_host_flags(important_host_flags "${CMAKE_${CUDA_C_OR_CXX}_FLAGS}")
+    foreach(f ${important_host_flags})
+      list(APPEND flags -Xcompiler ${f})
+    endforeach()
+
+    # Add our general CUDA_NVCC_FLAGS with the configuration specific flags
+    set(nvcc_flags ${CUDA_NVCC_FLAGS} ${config_specific_flags} ${nvcc_flags})
+
     file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
 
     # Some generators don't handle the multiple levels of custom command
@@ -1391,12 +1681,16 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options
     # we work around that issue by compiling the intermediate link object as a
     # pre-link custom command in that situation.
     set(do_obj_build_rule TRUE)
-    if (MSVC_VERSION GREATER 1599)
-      # VS 2010 and 2012 have this problem.  If future versions fix this issue,
-      # it should still work, it just won't be as nice as the other method.
+    if (MSVC_VERSION GREATER 1599 AND MSVC_VERSION LESS 1800)
+      # VS 2010 and 2012 have this problem.
       set(do_obj_build_rule FALSE)
     endif()
 
+    set(_verbatim VERBATIM)
+    if(nvcc_flags MATCHES "\\$\\(VCInstallDir\\)")
+      set(_verbatim "")
+    endif()
+
     if (do_obj_build_rule)
       add_custom_command(
         OUTPUT ${output_file}
@@ -1404,13 +1698,17 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options
         COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file}
         ${flags}
         COMMENT "Building NVCC intermediate link file ${output_file_relative_path}"
+        ${_verbatim}
         )
     else()
+      get_filename_component(output_file_dir "${output_file}" DIRECTORY)
       add_custom_command(
         TARGET ${cuda_target}
         PRE_LINK
         COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}"
+        COMMAND ${CMAKE_COMMAND} -E make_directory "${output_file_dir}"
         COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}"
+        ${_verbatim}
         )
     endif()
  endif()
@@ -1449,10 +1747,16 @@ macro(CUDA_ADD_LIBRARY cuda_target)
   # variable will have been defined.
   CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
 
-  target_link_libraries(${cuda_target}
+  target_link_libraries(${cuda_target} ${CUDA_LINK_LIBRARIES_KEYWORD}
     ${CUDA_LIBRARIES}
     )
 
+  if(CUDA_SEPARABLE_COMPILATION)
+    target_link_libraries(${cuda_target} ${CUDA_LINK_LIBRARIES_KEYWORD}
+      ${CUDA_cudadevrt_LIBRARY}
+      )
+  endif()
+
   # We need to set the linker language based on what the expected generated file
   # would be. CUDA_C_OR_CXX is computed based on CUDA_HOST_COMPILATION_CPP.
   set_target_properties(${cuda_target}
@@ -1493,7 +1797,7 @@ macro(CUDA_ADD_EXECUTABLE cuda_target)
   # variable will have been defined.
   CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS("${link_file}" ${cuda_target} "${_options}" "${${cuda_target}_SEPARABLE_COMPILATION_OBJECTS}")
 
-  target_link_libraries(${cuda_target}
+  target_link_libraries(${cuda_target} ${CUDA_LINK_LIBRARIES_KEYWORD}
     ${CUDA_LIBRARIES}
     )
 
@@ -1509,21 +1813,40 @@ endmacro()
 
 ###############################################################################
 ###############################################################################
-# CUDA COMPILE
+# (Internal) helper for manually added cuda source files with specific targets
 ###############################################################################
 ###############################################################################
-macro(CUDA_COMPILE generated_files)
+macro(cuda_compile_base cuda_target format generated_files)
+  # Update a counter in this directory, to keep phony target names unique.
+  set(_cuda_target "${cuda_target}")
+  get_property(_counter DIRECTORY PROPERTY _cuda_internal_phony_counter)
+  if(_counter)
+    math(EXPR _counter "${_counter} + 1")
+  else()
+    set(_counter 1)
+  endif()
+  string(APPEND _cuda_target "_${_counter}")
+  set_property(DIRECTORY PROPERTY _cuda_internal_phony_counter ${_counter})
 
   # Separate the sources from the options
   CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
+
   # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile OBJ _generated_files ${_sources} ${_cmake_options}
-    OPTIONS ${_options} )
+  CUDA_WRAP_SRCS( ${_cuda_target} ${format} _generated_files ${_sources}
+                  ${_cmake_options} OPTIONS ${_options} PHONY)
 
   set( ${generated_files} ${_generated_files})
 
 endmacro()
 
+###############################################################################
+###############################################################################
+# CUDA COMPILE
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE generated_files)
+  cuda_compile_base(cuda_compile OBJ ${generated_files} ${ARGN})
+endmacro()
 
 ###############################################################################
 ###############################################################################
@@ -1531,17 +1854,28 @@ endmacro()
 ###############################################################################
 ###############################################################################
 macro(CUDA_COMPILE_PTX generated_files)
+  cuda_compile_base(cuda_compile_ptx PTX ${generated_files} ${ARGN})
+endmacro()
 
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile_ptx PTX _generated_files ${_sources} ${_cmake_options}
-    OPTIONS ${_options} )
-
-  set( ${generated_files} ${_generated_files})
+###############################################################################
+###############################################################################
+# CUDA COMPILE FATBIN
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_FATBIN generated_files)
+  cuda_compile_base(cuda_compile_fatbin FATBIN ${generated_files} ${ARGN})
+endmacro()
 
+###############################################################################
+###############################################################################
+# CUDA COMPILE CUBIN
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_CUBIN generated_files)
+  cuda_compile_base(cuda_compile_cubin CUBIN ${generated_files} ${ARGN})
 endmacro()
 
+
 ###############################################################################
 ###############################################################################
 # CUDA ADD CUFFT TO TARGET
@@ -1549,9 +1883,9 @@ endmacro()
 ###############################################################################
 macro(CUDA_ADD_CUFFT_TO_TARGET target)
   if (CUDA_BUILD_EMULATION)
-    target_link_libraries(${target} ${CUDA_cufftemu_LIBRARY})
+    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cufftemu_LIBRARY})
   else()
-    target_link_libraries(${target} ${CUDA_cufft_LIBRARY})
+    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cufft_LIBRARY})
   endif()
 endmacro()
 
@@ -1562,9 +1896,9 @@ endmacro()
 ###############################################################################
 macro(CUDA_ADD_CUBLAS_TO_TARGET target)
   if (CUDA_BUILD_EMULATION)
-    target_link_libraries(${target} ${CUDA_cublasemu_LIBRARY})
+    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublasemu_LIBRARY})
   else()
-    target_link_libraries(${target} ${CUDA_cublas_LIBRARY})
+    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
   endif()
 endmacro()