#include // CUDA's, not caffe's, for fabs, signbit #include #include // thrust::plus #include #include #include "caffe/common.hpp" #include "caffe/util/math_functions.hpp" namespace caffe { template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { // Note that cublas follows fortran order. int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } template <> void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C) { // Note that cublas follows fortran order. int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } template <> void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, const double alpha, const double* A, const double* x, const double beta, double* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } template <> void caffe_gpu_axpy(const int N, const float alpha, const float* X, float* Y) { CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } template <> void caffe_gpu_axpy(const int N, const double alpha, const double* X, double* Y) { CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); } void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) { if (X != Y) { CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn) } } template <> void caffe_gpu_scal(const int N, const float alpha, float *X) { CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } template <> void caffe_gpu_scal(const int N, const double alpha, double *X) { CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } template <> void caffe_gpu_scal(const int N, const float alpha, float* X, cudaStream_t str) { cudaStream_t initial_stream; CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); } template <> void caffe_gpu_scal(const int N, const double alpha, double* X, cudaStream_t str) { cudaStream_t initial_stream; CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); } template <> void caffe_gpu_axpby(const int N, const float alpha, const float* X, const float beta, float* Y) { caffe_gpu_scal(N, beta, Y); caffe_gpu_axpy(N, alpha, X, Y); } template <> void caffe_gpu_axpby(const int N, const double alpha, const double* X, const double beta, double* Y) { caffe_gpu_scal(N, beta, Y); caffe_gpu_axpy(N, alpha, X, Y); } template <> void caffe_gpu_dot(const int n, const float* x, const float* y, float* out) { CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } template <> void caffe_gpu_dot(const int n, const double* x, const double* y, double * out) { CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); } template <> void caffe_gpu_asum(const int n, const float* x, float* y) { CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); } template <> void caffe_gpu_asum(const int n, const double* x, double* y) { CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); } template <> void caffe_gpu_scale(const int n, const float alpha, const float *x, float* y) { CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } template <> void caffe_gpu_scale(const int n, const double alpha, const double *x, double* y) { CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); } template __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = alpha; } } template void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { if (alpha == 0) { CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) return; } // NOLINT_NEXT_LINE(whitespace/operators) set_kernel<<>>( N, alpha, Y); } template void caffe_gpu_set(const int N, const int alpha, int* Y); template void caffe_gpu_set(const int N, const float alpha, float* Y); template void caffe_gpu_set(const int N, const double alpha, double* Y); template __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] += alpha; } } template <> void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { // NOLINT_NEXT_LINE(whitespace/operators) add_scalar_kernel<<>>( N, alpha, Y); } template <> void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { // NOLINT_NEXT_LINE(whitespace/operators) add_scalar_kernel<<>>( N, alpha, Y); } template __global__ void add_kernel(const int n, const Dtype* a, const Dtype* b, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } template <> void caffe_gpu_add(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) add_kernel<<>>( N, a, b, y); } template <> void caffe_gpu_add(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) add_kernel<<>>( N, a, b, y); } template __global__ void sub_kernel(const int n, const Dtype* a, const Dtype* b, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] - b[index]; } } template <> void caffe_gpu_sub(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) sub_kernel<<>>( N, a, b, y); } template <> void caffe_gpu_sub(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) sub_kernel<<>>( N, a, b, y); } template __global__ void mul_kernel(const int n, const Dtype* a, const Dtype* b, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] * b[index]; } } template <> void caffe_gpu_mul(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) mul_kernel<<>>( N, a, b, y); } template <> void caffe_gpu_mul(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) mul_kernel<<>>( N, a, b, y); } template __global__ void div_kernel(const int n, const Dtype* a, const Dtype* b, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] / b[index]; } } template <> void caffe_gpu_div(const int N, const float* a, const float* b, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) div_kernel<<>>( N, a, b, y); } template <> void caffe_gpu_div(const int N, const double* a, const double* b, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) div_kernel<<>>( N, a, b, y); } template __global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = abs(a[index]); } } template <> void caffe_gpu_abs(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) abs_kernel<<>>( N, a, y); } template <> void caffe_gpu_abs(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) abs_kernel<<>>( N, a, y); } template __global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = exp(a[index]); } } template <> void caffe_gpu_exp(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) exp_kernel<<>>( N, a, y); } template <> void caffe_gpu_exp(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) exp_kernel<<>>( N, a, y); } template __global__ void log_kernel(const int n, const Dtype* a, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = log(a[index]); } } template <> void caffe_gpu_log(const int N, const float* a, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) log_kernel<<>>( N, a, y); } template <> void caffe_gpu_log(const int N, const double* a, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) log_kernel<<>>( N, a, y); } template __global__ void powx_kernel(const int n, const Dtype* a, const Dtype alpha, Dtype* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = pow(a[index], alpha); } } template <> void caffe_gpu_powx(const int N, const float* a, const float alpha, float* y) { // NOLINT_NEXT_LINE(whitespace/operators) powx_kernel<<>>( N, a, alpha, y); } template <> void caffe_gpu_powx(const int N, const double* a, const double alpha, double* y) { // NOLINT_NEXT_LINE(whitespace/operators) powx_kernel<<>>( N, a, alpha, y); } DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); void caffe_gpu_rng_uniform(const int n, unsigned int* r) { CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); } template <> void caffe_gpu_rng_uniform(const int n, const float a, const float b, float* r) { CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); const float range = b - a; if (range != static_cast(1)) { caffe_gpu_scal(n, range, r); } if (a != static_cast(0)) { caffe_gpu_add_scalar(n, a, r); } } template <> void caffe_gpu_rng_uniform(const int n, const double a, const double b, double* r) { CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); const double range = b - a; if (range != static_cast(1)) { caffe_gpu_scal(n, range, r); } if (a != static_cast(0)) { caffe_gpu_add_scalar(n, a, r); } } template <> void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, float* r) { CURAND_CHECK( curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); } template <> void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, double* r) { CURAND_CHECK( curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); } } // namespace caffe