diff options
author | bddppq <bai@in.tum.de> | 2018-12-13 15:41:55 -0800 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-12-13 15:43:57 -0800 |
commit | de0784510d06504d0825112e003370070ecdcd7d (patch) | |
tree | ee77102656c4344b10382c53a949c44b384d94f4 | |
parent | 855d9e1f19d69e5b3963a2ec7ac3cf0fc31120d9 (diff) | |
download | pytorch-de0784510d06504d0825112e003370070ecdcd7d.tar.gz pytorch-de0784510d06504d0825112e003370070ecdcd7d.tar.bz2 pytorch-de0784510d06504d0825112e003370070ecdcd7d.zip |
Remove disabled_features in hipify
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15098
Reviewed By: ezyang
Differential Revision: D13453762
Pulled By: bddppq
fbshipit-source-id: e177042c78f5bf393163d660c25b80285353853d
-rw-r--r-- | aten/src/ATen/native/Distributions.cpp | 14 | ||||
-rw-r--r-- | aten/src/ATen/native/Distributions.h | 92 | ||||
-rw-r--r-- | aten/src/ATen/native/cuda/Distributions.cu | 15 | ||||
-rw-r--r-- | aten/src/THC/THCBlas.cu | 26 | ||||
-rw-r--r-- | aten/src/THC/THCGenerator.hpp | 4 | ||||
-rw-r--r-- | aten/src/THC/THCTensorRandom.cpp | 2 | ||||
-rw-r--r-- | aten/src/THC/THCTensorRandom.cu | 16 | ||||
-rw-r--r-- | aten/src/THC/THCTensorRandom.h | 8 | ||||
-rw-r--r-- | aten/src/THCUNN/generic/RReLU.cu | 2 | ||||
-rw-r--r-- | c10/cuda/CUDAMathCompat.h | 47 | ||||
-rw-r--r-- | test/test_distributions.py | 1 | ||||
-rw-r--r-- | tools/amd_build/disabled_features.json | 127 | ||||
-rw-r--r-- | tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py | 8 |
13 files changed, 165 insertions, 197 deletions
diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index c81554d69d..30ffb76c30 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -209,13 +209,17 @@ Tensor _s_gamma_cpu(const Tensor& alpha, Generator *gen) { std::lock_guard<std::mutex> lock(generator->mutex); CPU_tensor_apply2<scalar_t, scalar_t>(ret, alpha, [generator](scalar_t& ret_val, const scalar_t& alpha){ - BaseSampler<double> standard_uniform([generator] () { + + auto uniform_lambda = [generator] () { return THRandom_standard_uniform(generator); - }); - BaseSampler<double> standard_normal([generator] () { + }; + BaseSampler<double, decltype(uniform_lambda)> standard_uniform(uniform_lambda); + + auto normal_lambda = [generator] () { return THRandom_normal(generator, 0.0, 1.0); - }); - auto sample = sample_gamma<scalar_t, double>(alpha, standard_uniform, standard_normal); + }; + BaseSampler<double, decltype(normal_lambda)> standard_normal(normal_lambda); + auto sample = sample_gamma<scalar_t, double, decltype(uniform_lambda), decltype(normal_lambda)>(alpha, standard_uniform, standard_normal); ret_val = std::max(std::numeric_limits<scalar_t>::min(), (scalar_t) sample); } ); diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 0fe382a73c..31167d5907 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,9 +1,6 @@ #pragma once #include <TH/THMath.h> -#ifdef __CUDA_ARCH__ -#include <nvfunctional> -#endif #include <ATen/ATen.h> #include <ATen/CPUGenerator.h> @@ -11,6 +8,8 @@ #include <ATen/Generator.h> #include <TH/THGenerator.hpp> +#include <c10/macros/Macros.h> + namespace at {namespace native { static inline THGenerator* get_generator(at::Generator* gen) { @@ -21,24 +20,41 @@ static inline THGenerator* get_generator(at::Generator* gen) { }} // namespace at::native +// ROCM hcc doesn't work well with using std:: in kernel functions +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) +#include <c10/cuda/CUDAMathCompat.h> +#define compat_exp c10::cuda::compat::exp +#define compat_floor c10::cuda::compat::floor +#define compat_log c10::cuda::compat::log +#define compat_pow c10::cuda::compat::pow +#define compat_sqrt c10::cuda::compat::sqrt +#define compat_tan c10::cuda::compat::tan +#else +#define compat_exp std::exp +#define compat_floor std::floor +#define compat_log std::log +#define compat_pow std::pow +#define compat_sqrt std::sqrt +#define compat_tan std::tan +#endif + namespace { -#ifdef __CUDA_ARCH__ -#define nvfunction_or_function nvstd::function -#define deviceforcuda __device__ -#else -#define nvfunction_or_function std::function -#define deviceforcuda +#if !defined(__CUDA_ARCH__) && !defined(__HIP_PLATFORM_HCC__) // we cannot use std::isnan directly due to some incompatibility of // gcc constexpr'ing and nvcc #define isnan std::isnan #endif -template<typename scalar_t> +// Here sampler_t should be function type scalar_t(void). For gpu +// "sampler" is a device function, but since ROCM doesn't have +// equivalent to nvstd::function, we use a template type parameter to +// capture it. +template<typename scalar_t, typename sampler_t> struct BaseSampler { - nvfunction_or_function<scalar_t(void)> sampler; - deviceforcuda BaseSampler(nvfunction_or_function<scalar_t(void)> sampler): sampler(sampler) {} - deviceforcuda scalar_t sample() { + sampler_t sampler; + C10_DEVICE BaseSampler(const sampler_t& sampler): sampler(sampler) {} + C10_DEVICE scalar_t sample() { return sampler(); } }; @@ -69,21 +85,21 @@ struct BaseSampler { * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -template<typename scalar_t, typename accscalar_t> -deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t>& standard_uniform, BaseSampler<accscalar_t>& standard_normal) { +template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t, typename normal_sampler_t> +C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform, BaseSampler<accscalar_t, normal_sampler_t>& standard_normal) { accscalar_t scale = 1.0f; // Boost alpha for higher acceptance probability. if (alpha < 1.0f) { if (alpha == 0.f) return 0.f; - scale *= std::pow(1 - standard_uniform.sample(), 1.0f / alpha); + scale *= compat_pow(1 - standard_uniform.sample(), 1.0f / alpha); alpha += 1.0f; } // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) // doi:10.1145/358407.358414 const accscalar_t d = alpha - 1.0f / 3.0f; - const accscalar_t c = 1.0f / std::sqrt(9.0f * d); + const accscalar_t c = 1.0f / compat_sqrt(9.0f * d); for (;;) { accscalar_t x, y; do { @@ -95,13 +111,13 @@ deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t>& st const accscalar_t xx = x * x; if (u < 1.0f - 0.0331f * xx * xx) return static_cast<scalar_t>(scale * d * v); - if (std::log(u) < 0.5f * xx + d * (1.0f - v + std::log(v))) + if (compat_log(u) < 0.5f * xx + d * (1.0f - v + compat_log(v))) return static_cast<scalar_t>(scale * d * v); } } template <typename scalar_t> -deviceforcuda static inline scalar_t polevl(const scalar_t x, const scalar_t A[], size_t len) { +C10_DEVICE static inline scalar_t polevl(const scalar_t x, const scalar_t A[], size_t len) { scalar_t result = 0; for (size_t i = 0; i <= len; i++) { result = result * x + A[i]; @@ -118,20 +134,21 @@ deviceforcuda static inline scalar_t polevl(const scalar_t x, const scalar_t A[ * Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier */ template<typename scalar_t, typename accscalar_t> -deviceforcuda static inline scalar_t digamma_one(scalar_t x) { +C10_DEVICE static inline scalar_t digamma_one(scalar_t x) { constexpr accscalar_t PSI_10 = 2.25175258906672110764; if (x == 0) { return INFINITY; } accscalar_t additional_summand = 0; - int x_is_integer = x == std::floor(x); + int x_is_integer = x == compat_floor(x); if (x < 0) { if (x_is_integer) { return INFINITY; } // it is more standard to write this as recursion, but // nvcc does not like that - additional_summand = - static_cast<accscalar_t>(M_PI) / std::tan(static_cast<accscalar_t>(M_PI) * x); + additional_summand = -static_cast<accscalar_t>(M_PI) / + compat_tan(static_cast<accscalar_t>(M_PI) * x); x = 1 - x; } @@ -161,13 +178,14 @@ deviceforcuda static inline scalar_t digamma_one(scalar_t x) { accscalar_t z = 1.0 / (x * x); y = z * polevl<accscalar_t>(z, A, 6); } - return static_cast<scalar_t>(result + std::log(x) - (0.5f / x) - y + additional_summand); + return static_cast<scalar_t>( + result + compat_log(x) - (0.5f / x) - y + additional_summand); } // Computes the reparameterized gradient -(d/dalpha cdf(x;alpha)) / pdf(x;alpha) // for random number x drawn from a standard Gamma distribution Gamma(alpha). template <typename scalar_t, typename accscalar_t> -deviceforcuda scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { +C10_DEVICE scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { // Use a Taylor series expansion for small x. accscalar_t x = static_cast<accscalar_t>(x_); accscalar_t alpha = static_cast<accscalar_t>(alpha_); @@ -182,11 +200,13 @@ deviceforcuda scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { series1 += numer / denom; series2 += numer / (denom * denom); } - const auto pow_x_alpha = std::pow(x, alpha); - const auto gamma_pdf = std::pow(x, alpha - 1) * std::exp(-x); + const auto pow_x_alpha = compat_pow(x, alpha); + const auto gamma_pdf = compat_pow(x, alpha - 1) * compat_exp(-x); const auto gamma_cdf = pow_x_alpha * series1; - const auto gamma_cdf_alpha = (std::log(x) - digamma_one<accscalar_t,accscalar_t>(alpha)) * gamma_cdf - - pow_x_alpha * series2; + const auto gamma_cdf_alpha = + (compat_log(x) - digamma_one<accscalar_t, accscalar_t>(alpha)) * + gamma_cdf - + pow_x_alpha * series2; const auto result = -gamma_cdf_alpha / gamma_pdf; return isnan(result) ? static_cast<scalar_t>( 0.f ) : static_cast<scalar_t>(result); } @@ -200,20 +220,22 @@ deviceforcuda scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { const auto denom = 1244160 * (alpha * alpha) * (alpha * alpha); return static_cast<scalar_t>(numer_1 * numer_2 / denom); } - const auto denom = std::sqrt(8 * alpha); + const auto denom = compat_sqrt(8 * alpha); const auto term2 = denom / (alpha - x); - const auto term3 = std::pow(x - alpha - alpha * std::log(x / alpha), static_cast<accscalar_t>(-1.5)); + const auto term3 = compat_pow( + x - alpha - alpha * compat_log(x / alpha), + static_cast<accscalar_t>(-1.5)); const auto term23 = (x < alpha) ? term2 - term3 : term2 + term3; - const auto term1 = std::log(x / alpha) * term23 - - std::sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x)); + const auto term1 = compat_log(x / alpha) * term23 - + compat_sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x)); const auto stirling = 1 + 1 / (12 * alpha) * (1 + 1 / (24 * alpha)); const auto numer = x * term1; return static_cast<scalar_t>(-stirling * numer / denom); } // Use a bivariate rational approximation to the reparameterized gradient. - const auto u = std::log(x / alpha); - const auto v = std::log(alpha); + const auto u = compat_log(x / alpha); + const auto v = compat_log(alpha); static const accscalar_t coef_uv[3][8] = { {0.16009398, -0.094634809, 0.025146376, -0.0030648343, 1, 0.32668115, 0.10406089, 0.0014179084}, @@ -228,7 +250,7 @@ deviceforcuda scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { } const auto p = coef_v[0] + v * (coef_v[1] + v * (coef_v[2] + v * coef_v[3])); const auto q = coef_v[4] + v * (coef_v[5] + v * (coef_v[6] + v * coef_v[7])); - return static_cast<scalar_t>(std::exp(p / q)); + return static_cast<scalar_t>(compat_exp(p / q)); } } // namespace diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index 4ee3bb95cc..00a1b34483 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -9,7 +9,6 @@ #include <curand_philox4x32_x.h> #include <utility> #include <functional> -#include <nvfunctional> #include <ATen/native/Distributions.h> @@ -72,13 +71,17 @@ void gamma_cuda_kernel( blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state); - BaseSampler<accscalar_t> standard_uniform([&state] __device__ () { + + auto uniform_lambda = [&state] __device__ () { return curand_uniform(&state); - }); - BaseSampler<accscalar_t> standard_normal([&state] __device__ () { + }; + BaseSampler<accscalar_t, decltype(uniform_lambda)> standard_uniform(uniform_lambda); + + auto normal_lambda = [&state] __device__ () { return curand_normal(&state); - }); - auto sample = sample_gamma<scalar_t, accscalar_t>(alpha, standard_uniform, standard_normal); + }; + BaseSampler<accscalar_t, decltype(normal_lambda)> standard_normal(normal_lambda); + auto sample = sample_gamma<scalar_t, accscalar_t, decltype(uniform_lambda), decltype(normal_lambda)>(alpha, standard_uniform, standard_normal); auto min_value = std::numeric_limits<scalar_t>::lowest(); ret_val = (min_value > sample) ? min_value : sample; }); diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 73c3b0012d..51ae225fcc 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -509,6 +509,7 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i /* Inverse */ void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) { +#ifndef __HIP_PLATFORM_HCC__ if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) { THError("Cublas_Sgetrf only supports n, lda, batchSize" @@ -517,9 +518,13 @@ void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, i cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasSgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); +#else + THError("THCudaBlas_Sgetrf not supported in ROCM."); +#endif } void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) { +#ifndef __HIP_PLATFORM_HCC__ if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) { THError("Cublas_Dgetrf only supports n, lda, batchSize" @@ -528,10 +533,14 @@ void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); +#else + THError("THCudaBlas_Dgetrf not supported in ROCM."); +#endif } void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) { +#ifndef __HIP_PLATFORM_HCC__ if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) { THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize" @@ -544,11 +553,15 @@ void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const floa cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasSgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize)); +#else + THError("THCudaBlas_Sgetrs not supported in ROCM."); +#endif } void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize) { +#ifndef __HIP_PLATFORM_HCC__ if( (n >= INT_MAX) || (nrhs >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (batchSize >= INT_MAX) ) { THError("Cublas_Dgetrs only supports n, nrhs, lda, ldb, batchSize" @@ -561,10 +574,13 @@ void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const doub cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasDgetrsBatched(handle, opa, n, nrhs, a, lda, pivot, b, ldb, info, batchSize)); +#else + THError("THCudaBlas_Dgetrs not supported in ROCM."); +#endif } void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) { - +#ifndef __HIP_PLATFORM_HCC__ if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) { THError("Cublas_Sgetri only supports n, lda, ldc, batchSize" @@ -573,10 +589,13 @@ void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pi cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasSgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); +#else + THError("THCudaBlas_Sgetri not supported in ROCM."); +#endif } void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) { - +#ifndef __HIP_PLATFORM_HCC__ if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) { THError("Cublas_Dgetri only supports n, lda, ldc, batchSize" @@ -585,4 +604,7 @@ void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *p cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasDgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); +#else + THError("THCudaBlas_Dgetri not supported in ROCM."); +#endif } diff --git a/aten/src/THC/THCGenerator.hpp b/aten/src/THC/THCGenerator.hpp index ea5d1ba347..f1c411985b 100644 --- a/aten/src/THC/THCGenerator.hpp +++ b/aten/src/THC/THCGenerator.hpp @@ -7,8 +7,8 @@ #include <mutex> typedef struct THCGeneratorState { - struct curandStateMtgp32* gen_states; - struct mtgp32_kernel_params *kernel_params; + curandStateMtgp32* gen_states; + mtgp32_kernel_params *kernel_params; int initf; uint64_t initial_seed; std::atomic<int64_t> philox_seed_offset; diff --git a/aten/src/THC/THCTensorRandom.cpp b/aten/src/THC/THCTensorRandom.cpp index 5853d9caea..e3cf5d9353 100644 --- a/aten/src/THC/THCTensorRandom.cpp +++ b/aten/src/THC/THCTensorRandom.cpp @@ -87,7 +87,7 @@ THCGenerator* THCRandom_getGenerator(THCState* state) return gen; } -struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state) +curandStateMtgp32* THCRandom_generatorStates(THCState* state) { THCGenerator* gen = THCRandom_getGenerator(state); return gen->state.gen_states; diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index 69228aec96..58bbabc6a3 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -11,8 +11,6 @@ #include <ATen/cuda/_curand_mtgp32_host.h> #include <thrust/functional.h> -#include <curand.h> -#include <curand_kernel.h> #define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 @@ -23,7 +21,7 @@ THCGenerator* THCRandom_getGenerator(THCState* state); /* Sets up generator. Allocates but does not create the generator states. Not thread-safe. */ __host__ void initializeGenerator(THCState *state, THCGenerator* gen) { - gen->state.gen_states = static_cast<struct curandStateMtgp32*>(THCudaMalloc(state, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32))); + gen->state.gen_states = static_cast<curandStateMtgp32*>(THCudaMalloc(state, MAX_NUM_BLOCKS * sizeof(curandStateMtgp32))); gen->state.kernel_params = static_cast<mtgp32_kernel_params*>(THCudaMalloc(state, sizeof(mtgp32_kernel_params))); } @@ -44,7 +42,7 @@ __host__ void createGeneratorState(THCGenerator* gen, uint64_t seed) gen->state.philox_seed_offset = 0; } -__host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state) +THC_API __host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state) { THCGenerator* gen = THCRandom_getGenerator(state); std::lock_guard<std::mutex> lock(gen->mutex); @@ -65,10 +63,14 @@ __host__ void THCRandom_getRNGState(THCState* state, THByteTensor *rng_state) __global__ void set_rngstate_kernel(curandStateMtgp32 *state, mtgp32_kernel_params *kernel) { +#ifndef __HIP_PLATFORM_HCC__ state[threadIdx.x].k = kernel; +#else + state[threadIdx.x].set_params(kernel); +#endif } -__host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state) +THC_API __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state) { THCGenerator* gen = THCRandom_getGenerator(state); std::lock_guard<std::mutex> lock(gen->mutex); @@ -118,7 +120,7 @@ __device__ inline at::Half half_uniform_scale_and_shift(float x, double a, doubl } #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ -__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ +__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ { \ int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \ int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \ @@ -132,7 +134,7 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ } #define GENERATE_KERNEL2(NAME, T, ARG1, ARG2, CURAND_T, CURAND_FUNC, TRANSFORM) \ -__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) \ +__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) \ { \ int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \ int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \ diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h index 0dc8c2a62d..265b7ce87d 100644 --- a/aten/src/THC/THCTensorRandom.h +++ b/aten/src/THC/THCTensorRandom.h @@ -5,9 +5,9 @@ #include <THC/generic/THCTensorRandom.h> #include <THC/THCGenerateAllTypes.h> -#ifdef __HIP_PLATFORM_HCC__ -#include <hiprand_kernel.h> -#endif + +#include <curand.h> +#include <curand_kernel.h> typedef struct THCGenerator THCGenerator; @@ -29,6 +29,6 @@ THC_API uint64_t THCRandom_initialSeed(struct THCState *state); THC_API void THCRandom_getRNGState(struct THCState *state, THByteTensor *rng_state); THC_API void THCRandom_setRNGState(struct THCState *state, THByteTensor *rng_state); -THC_API struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state); +THC_API curandStateMtgp32* THCRandom_generatorStates(struct THCState* state); #endif diff --git a/aten/src/THCUNN/generic/RReLU.cu b/aten/src/THCUNN/generic/RReLU.cu index 2cbc4b9378..654ea1408c 100644 --- a/aten/src/THCUNN/generic/RReLU.cu +++ b/aten/src/THCUNN/generic/RReLU.cu @@ -16,7 +16,7 @@ void THNN_(RReLU_updateOutput)( void *generator) { THCUNN_assertSameGPU(state, 3, input, output, noise); - struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state); + curandStateMtgp32* gen_states = THCRandom_generatorStates(state); if (train) { diff --git a/c10/cuda/CUDAMathCompat.h b/c10/cuda/CUDAMathCompat.h index 35176e729e..63565150e6 100644 --- a/c10/cuda/CUDAMathCompat.h +++ b/c10/cuda/CUDAMathCompat.h @@ -22,33 +22,68 @@ namespace cuda { namespace compat { __MATH_FUNCTIONS_DECL__ float abs(float x) { - return fabsf(x); + return ::fabsf(x); } __MATH_FUNCTIONS_DECL__ double abs(double x) { - return fabs(x); + return ::fabs(x); +} + +__MATH_FUNCTIONS_DECL__ float exp(float x) { + return ::expf(x); +} +__MATH_FUNCTIONS_DECL__ double exp(double x) { + return ::exp(x); +} + +__MATH_FUNCTIONS_DECL__ float floor(float x) { + return ::floorf(x); +} +__MATH_FUNCTIONS_DECL__ double floor(double x) { + return ::floor(x); +} + +__MATH_FUNCTIONS_DECL__ float log(float x) { + return ::logf(x); +} +__MATH_FUNCTIONS_DECL__ double log(double x) { + return ::log(x); } __MATH_FUNCTIONS_DECL__ float max(float x, float y) { - return fmaxf(x, y); + return ::fmaxf(x, y); } __MATH_FUNCTIONS_DECL__ double max(double x, double y) { - return fmax(x, y); + return ::fmax(x, y); } __MATH_FUNCTIONS_DECL__ float pow(float x, float y) { - return powf(x, y); + return ::powf(x, y); } __MATH_FUNCTIONS_DECL__ double pow(double x, double y) { return ::pow(x, y); } __MATH_FUNCTIONS_DECL__ void sincos(float x, float* sptr, float* cptr) { - return sincosf(x, sptr, cptr); + return ::sincosf(x, sptr, cptr); } __MATH_FUNCTIONS_DECL__ void sincos(double x, double* sptr, double* cptr) { return ::sincos(x, sptr, cptr); } +__MATH_FUNCTIONS_DECL__ float sqrt(float x) { + return ::sqrtf(x); +} +__MATH_FUNCTIONS_DECL__ double sqrt(double x) { + return ::sqrt(x); +} + +__MATH_FUNCTIONS_DECL__ float tan(float x) { + return ::tanf(x); +} +__MATH_FUNCTIONS_DECL__ double tan(double x) { + return ::tan(x); +} + } // namespace compat } // namespace cuda } // namespace c10 diff --git a/test/test_distributions.py b/test/test_distributions.py index de1c43e19d..993e5d9e1a 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -1975,7 +1975,6 @@ class TestDistributions(TestCase): @unittest.skipIf(not TEST_CUDA, "CUDA not found") @unittest.skipIf(not TEST_NUMPY, "Numpy not found") - @skipIfRocm def test_gamma_gpu_sample(self): set_rng_seed(0) for alpha, beta in product([0.1, 1.0, 5.0], [0.1, 1.0, 10.0]): diff --git a/tools/amd_build/disabled_features.json b/tools/amd_build/disabled_features.json index f7864dc9e5..c7228b7ad1 100644 --- a/tools/amd_build/disabled_features.json +++ b/tools/amd_build/disabled_features.json @@ -1,131 +1,8 @@ { - "disable_unsupported_hip_calls": - [ - { - "path": "aten/src/THC/THCBlas.cu", - "functions": { - "cublasSgemmEx": "rocblas_status_internal_error", - "cublasSgetrfBatched": "rocblas_status_internal_error", - "cublasDgetrfBatched": "rocblas_status_internal_error", - "cublasSgetrsBatched": "rocblas_status_internal_error", - "cublasDgetrsBatched": "rocblas_status_internal_error", - "cublasSgetriBatched": "rocblas_status_internal_error", - "cublasDgetriBatched": "rocblas_status_internal_error" - } - }, - { - "path": "aten/src/THC/THCStream.cpp", - "functions": { - "cudaStreamCreateWithFlags": "hipSuccess", - "cudaStreamCreateWithPriority": "hipSuccess" - } - }, - { - "path": "aten/src/THC/THCAllocator.cpp", - "functions": { - "cudaMallocManaged": "hipSuccess" - } - }, - { - "path": "aten/src/ATen/native/cuda/Distributions.cu", - "s_constants": { - "#include <nvfunctional>": "" - } - }, - { - "path": "aten/src/ATen/native/cuda/RoiPooling.cu", - "s_constants": { - "RoiPooling2d_forward_kernel<<<": "RoiPooling2d_forward_kernel<float><<<" - } - }, - { - "path": "aten/src/THC/THCTensorRandom.cpp", - "s_constants": { - "struct curandStateMtgp32*": "curandStateMtgp32*" - } - }, - { - "path": "aten/src/THC/THCTensorRandom.cu", - "s_constants": { - "struct curandStateMtgp32*": "curandStateMtgp32*", - "__host__ void THCRandom_getRNGState": "extern \"C\" __host__ void THCRandom_getRNGState", - "__host__ void THCRandom_setRNGState": "extern \"C\" __host__ void THCRandom_setRNGState", - "state[threadIdx.x].k = kernel;" : "state[threadIdx.x].set_params(kernel);" - } - }, - { - "path": "aten/src/THC/THCTensorRandom.h", - "s_constants": { - "struct curandStateMtgp32*": "curandStateMtgp32*" - } - }, - { - "path": "aten/src/THCUNN/generic/RReLU.cu", - "s_constants": { - "struct curandStateMtgp32*": "curandStateMtgp32*" - } - }, - { - "path": "aten/src/THC/THCGenerator.hpp", - "s_constants": { - "struct curandStateMtgp32*": "curandStateMtgp32*", - "struct mtgp32_kernel_params": "mtgp32_kernel_params" - } - }, - { - "path": "aten/src/ATen/native/cuda/RoiPooling.cu", - "s_constants": { - "RoiPooling2d_backward_kernel<<<": "RoiPooling2d_backward_kernel<float><<<" - } - }, - { - "path": "aten/src/ATen/native/cuda/Unique.cu", - "s_constants": { - "inverse_indices_kernel<<<": "inverse_indices_kernel<scalar_t><<<" - } - } - ], + "disable_unsupported_hip_calls": [ + ], "disabled_modules": [ ], "disabled_functions": [ - { - "path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh", - "functions": [ - "kernelPointwiseApply4" - ] - }, - { - "path": "aten/src/THCUNN/LookupTable.cu", - "functions": [ - "warpHasCollision" - ] - }, - { - "path": "aten/src/ATen/native/cuda/Distributions.cu", - "functions": [ - "gamma_cuda_kernel", - "gamma_grad_cuda_kernel" - ] - }, - { - "path": "aten/src/THCUNN/generic/SparseLinear.cu", - "functions": [ - "THNN_(SparseLinear_updateOutput)", - "THNN_(SparseLinear_accGradParameters)" - ] - }, - { - "path": "aten/src/THCUNN/generic/LookupTable.cu", - "functions": [ - "THNN_(LookupTable_accGradParameters)", - "THNN_(LookupTable_renorm)" - ] - }, - { - "path": "aten/src/THC/generic/THCTensor.cu", - "functions": [ - "THCTensor_(getTextureObject)" - ] - } ] } diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index 53cfeb35a9..87931f1aa5 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -276,7 +276,6 @@ CUDA_INCLUDE_MAP = collections.OrderedDict([ ("cusparse.h", ("hipsparse.h", CONV_INCLUDE, API_RAND)), ("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)), ("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)), - ("#include <nvfunctional>", ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED)), ]) CUDA_IDENTIFIER_MAP = collections.OrderedDict([ @@ -2179,13 +2178,16 @@ CUDA_SPARSE_MAP = collections.OrderedDict([ ("cusparseStatus_t", ("hipsparseStatus_t", CONV_MATH_FUNC, API_SPARSE)), ("cusparseHandle_t", ("hipsparseHandle_t", CONV_MATH_FUNC, API_SPARSE)), ("cusparseOperation_t", ("hipsparseOperation_t", CONV_TYPE, API_SPARSE)), + ("cusparseCreateMatDescr", ("hipsparseCreateMatDescr", CONV_MATH_FUNC, API_SPARSE)), ("cusparseCreate", ("hipsparseCreate", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseDestroyMatDescr", ("hipsparseDestroyMatDescr", CONV_MATH_FUNC, API_SPARSE)), ("cusparseDestroy", ("hipsparseDestroy", CONV_MATH_FUNC, API_SPARSE)), ("cusparseXcoo2csr", ("hipsparseXcoo2csr", CONV_MATH_FUNC, API_SPARSE)), ("cusparseMatDescr_t", ("hipsparseMatDescr_t", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseCreateMatDescr", ("hipsparseCreateMatDescr", CONV_MATH_FUNC, API_SPARSE)), ("cusparseScsrmm2", ("hipsparseScsrmm2", CONV_MATH_FUNC, API_SPARSE)), ("cusparseDcsrmm2", ("hipsparseDcsrmm2", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseScsrmm", ("hipsparseScsrmm", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseDcsrmm", ("hipsparseDcsrmm", CONV_MATH_FUNC, API_SPARSE)), ("cusparseXcsrsort_bufferSizeExt", ("hipsparseXcsrsort_bufferSizeExt", CONV_MATH_FUNC, API_SPARSE)), ("cusparseXcsrsort", ("hipsparseXcsrsort", CONV_MATH_FUNC, API_SPARSE)), ("cusparseXcoosort_bufferSizeExt", ("hipsparseXcoosort_bufferSizeExt", CONV_MATH_FUNC, API_SPARSE)), @@ -2193,6 +2195,7 @@ CUDA_SPARSE_MAP = collections.OrderedDict([ ("cusparseSetStream", ("hipsparseSetStream", CONV_MATH_FUNC, API_SPARSE)), ("cusparseCreateIdentityPermutation", ("hipsparseCreateIdentityPermutation", CONV_MATH_FUNC, API_SPARSE)), ("cusparseSetMatIndexBase", ("hipsparseSetMatIndexBase", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseSetMatType", ("hipsparseSetMatType", CONV_MATH_FUNC, API_SPARSE)), ("CUSPARSE_STATUS_SUCCESS", ("HIPSPARSE_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_SPARSE)), ("CUSPARSE_STATUS_NOT_INITIALIZED", ("HIPSPARSE_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_SPARSE)), ("CUSPARSE_STATUS_ALLOC_FAILED", ("HIPSPARSE_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_SPARSE)), @@ -2208,6 +2211,7 @@ CUDA_SPARSE_MAP = collections.OrderedDict([ ("CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE", ("HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPARSE)), ("CUSPARSE_INDEX_BASE_ZERO", ("HIPSPARSE_INDEX_BASE_ZERO", CONV_NUMERIC_LITERAL, API_SPARSE)), ("CUSPARSE_INDEX_BASE_ONE", ("HIPSPARSE_INDEX_BASE_ONE", CONV_NUMERIC_LITERAL, API_SPARSE)), + ("CUSPARSE_MATRIX_TYPE_GENERAL", ("HIPSPARSE_MATRIX_TYPE_GENERAL", CONV_NUMERIC_LITERAL, API_SPARSE)), ]) PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict([ |