From 9ab077dc9d0bbe651348a498dd5472dc4d51f0af Mon Sep 17 00:00:00 2001 From: Aapo Kyrola Date: Mon, 17 Apr 2017 21:23:45 -0700 Subject: Revert D4871248: [caffe2][PR] fp16 support for FullyConnected op Summary: This reverts commit 6a991c2c993dcf0b1e18aa3f2ffbe19e693dbadd Differential Revision: D4871248 fbshipit-source-id: b6d812d09a00c83e363432e84742c503abfed65b --- caffe2/utils/conversions.h | 182 ------------------ caffe2/utils/math-detail.h | 36 +--- caffe2/utils/math.h | 53 ++---- caffe2/utils/math_cpu.cc | 119 +++++------- caffe2/utils/math_gpu.cu | 423 +++++++++++------------------------------- caffe2/utils/math_gpu_test.cc | 57 ++++++ 6 files changed, 238 insertions(+), 632 deletions(-) delete mode 100644 caffe2/utils/conversions.h (limited to 'caffe2/utils') diff --git a/caffe2/utils/conversions.h b/caffe2/utils/conversions.h deleted file mode 100644 index 0c6c3238f7..0000000000 --- a/caffe2/utils/conversions.h +++ /dev/null @@ -1,182 +0,0 @@ -#pragma once - -#include - -#ifdef __CUDA_ARCH__ -#include -#endif - -#ifdef __CUDA_ARCH__ -#define CONVERSIONS_DECL __host__ __device__ inline -#else -#define CONVERSIONS_DECL inline -#endif - -namespace caffe2 { - -namespace convert { - -namespace { -inline float16 cpu_float2half_rn(float f) { - float16 ret; - - static_assert( - sizeof(unsigned int) == sizeof(float), - "Programming error sizeof(unsigned int) != sizeof(float)"); - - unsigned* xp = reinterpret_cast(&f); - unsigned x = *xp; - unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; - - // Get rid of +NaN/-NaN case first. - if (u > 0x7f800000) { - ret.x = 0x7fffU; - return ret; - } - - sign = ((x >> 16) & 0x8000); - - // Get rid of +Inf/-Inf, +0/-0. - if (u > 0x477fefff) { - ret.x = sign | 0x7c00U; - return ret; - } - if (u < 0x33000001) { - ret.x = (sign | 0x0000); - return ret; - } - - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); - - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; - } else { - shift = 0x7e - exponent; - exponent = 0; - mantissa |= 0x800000; - } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); - - // Round to nearest even. - remainder = (mantissa & lsb_m1); - mantissa >>= shift; - if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { - ++mantissa; - if (!(mantissa & 0x3ff)) { - ++exponent; - mantissa = 0; - } - } - - ret.x = (sign | (exponent << 10) | mantissa); - - return ret; -} - -inline float cpu_half2float(float16 h) { - unsigned sign = ((h.x >> 15) & 1); - unsigned exponent = ((h.x >> 10) & 0x1f); - unsigned mantissa = ((h.x & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ - if (mantissa) { - unsigned int msb; - exponent = 0x71; - do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ - --exponent; - } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ - } - } else { - exponent += 0x70; - } - - int temp = ((sign << 31) | (exponent << 23) | mantissa); - - unsigned* rp = reinterpret_cast(&temp); - return *rp; -} - -}; // anonymous -// general version: defer to static_cast -template -CONVERSIONS_DECL OUT To(const IN in) { - return static_cast(in); -} - -#if __CUDA_ARCH__ -__device__ __inline__ __half inf_clip(__half h) { - int isi = __hisinf(h); - if (isi > 0) { - // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff) - h.x = 0x7bffU; - } else if (isi < 0) { - // As above, negated - h.x = 0x7bffU ^ 0x8000; - } - return h; -} -#endif - -// explicit for fp16 -template <> -CONVERSIONS_DECL float16 To(const float in) { -#if __CUDA_ARCH__ - // hacky interface between C2 fp16 and CUDA - float16 ret; - __half r; - // r.x = __float2half_rn(in); - // ret.x = inf_clip(r).x; - ret.x = __float2half(in).x; - return ret; -#else - return cpu_float2half_rn(in); -#endif -} - -template <> -CONVERSIONS_DECL float To(const float16 in) { -#if __CUDA_ARCH__ - __half tmp; - tmp.x = in.x; - return __half2float(tmp); -#else - return cpu_half2float(in); -#endif -}; - -template <> -CONVERSIONS_DECL float To(const float in) { - return in; -} - -template -CONVERSIONS_DECL OUT Get(IN x) { - return static_cast(x); -} - -template <> -CONVERSIONS_DECL float Get(float16 x) { - return To(x); -} - -template <> -CONVERSIONS_DECL float16 Get(float x) { - return To(x); -} - -}; // namespace convert - -}; // namespace caffe2 - -#undef CONVERSIONS_DECL diff --git a/caffe2/utils/math-detail.h b/caffe2/utils/math-detail.h index 07a1f997d6..35a880a6d4 100644 --- a/caffe2/utils/math-detail.h +++ b/caffe2/utils/math-detail.h @@ -11,12 +11,8 @@ namespace detail { template struct ScaleImpl { - inline void operator()( - const int N, - const float alpha, - const T* x, - T* y, - Context* context) { + inline void + operator()(const int N, const T alpha, const T* x, T* y, Context* context) { Scale(N, alpha, x, y, context); } }; @@ -26,7 +22,7 @@ template struct ScaleImpl { inline void operator()( const int N, - const float alpha, + const T alpha, const T* x, T* y, CPUContext* context) { @@ -37,12 +33,8 @@ struct ScaleImpl { template struct AxpyImpl { - inline void operator()( - const int N, - const float alpha, - const T* x, - T* y, - Context* context) { + inline void + operator()(const int N, const T alpha, const T* x, T* y, Context* context) { Axpy(N, alpha, x, y, context); } }; @@ -52,7 +44,7 @@ template struct AxpyImpl { inline void operator()( const int N, - const float alpha, + const T alpha, const T* x, T* y, CPUContext* context) { @@ -65,22 +57,14 @@ struct AxpyImpl { } // namespace detail template -inline void ScaleFixedSize( - const int N, - const float alpha, - const T* x, - T* y, - Context* context) { +inline void +ScaleFixedSize(const int N, const T alpha, const T* x, T* y, Context* context) { detail::ScaleImpl()(N, alpha, x, y, context); } template -inline void AxpyFixedSize( - const int N, - const float alpha, - const T* x, - T* y, - Context* context) { +inline void +AxpyFixedSize(const int N, const T alpha, const T* x, T* y, Context* context) { detail::AxpyImpl()(N, alpha, x, y, context); } diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h index 105cb19733..a2472c0d33 100644 --- a/caffe2/utils/math.h +++ b/caffe2/utils/math.h @@ -141,20 +141,10 @@ void ColwiseMax(const int N, const int D, const T* x, T* y, // Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. -template -void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const T* A, - const T* B, - const float beta, - T* C, - Context* context, - TensorProto::DataType math_type = TensorProto_DataType_FLOAT); +template +void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C, Context* context); // We also provide a gemm that has explicit lda, ldb and ldc specified. // In most cases you probably want to use the function above, though. @@ -179,18 +169,10 @@ void GemmEx( // to Trans, the output is: // CblasNoTrans: x is an N dim vector and y is an M dim vector. // CblasTrans: x is an M dim vector and y is an N dim vector. -template -void Gemv( - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const float alpha, - const T* A, - const T* x, - const float beta, - T* y, - Context* context, - TensorProto::DataType math_type = TensorProto_DataType_FLOAT); +template +void Gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const T alpha, const T* A, const T* x, const T beta, + T* y, Context* context); template void Set(const TIndex N, const T alpha, T* X, Context* context); @@ -236,31 +218,28 @@ void Select(const int N, const int D, const T* x, const int* idx, T* y, Context* context); template -void Scale(const int N, const float alpha, const T* x, T* y, Context* context); +void Scale(const int N, const T alpha, const T* x, T* y, Context* context); // Different from the Scale function above, if alpha is passed in // as a pointer, we will assume that it lives on the Context device, // for example on GPU. template -void Scale(const int N, const float* alpha, const T* x, T* y, Context* context); +void Scale(const int N, const T* alpha, const T* x, T* y, + Context* context); template -void Axpy(const int N, const float alpha, const T* x, T* y, Context* context); +void Axpy(const int N, const T alpha, const T* x, T* y, Context* context); // Different from the Axpy function above, if alpha is passed in // as a pointer, we will assume that it lives on the Context device, // for example on GPU. template -void Axpy(const int N, const float* alpha, const T* x, T* y, Context* context); +void Axpy(const int N, const T* alpha, const T* x, T* y, + Context* context); template -void Axpby( - const int N, - const float alpha, - const T* x, - const T b, - T* y, - Context* context); +void Axpby(const int N, const T alpha, const T* x, const T b, T* y, + Context* context); template void Im2colNd( diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index e4340dfaf1..5cac0c8339 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -58,18 +58,9 @@ namespace math { // CblasTrans, respectively, for each of A and B. template <> void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const float* B, - const float beta, - float* C, - CPUContext* context, - TensorProto::DataType math_type) { + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, + const float* B, const float beta, float* C, CPUContext* context) { auto C_mat = EigenMatrixMap(C, N, M); if (beta == 0) { C_mat.setZero(); @@ -187,8 +178,7 @@ void Gemv( const float* x, const float beta, float* y, - CPUContext* context, - TensorProto::DataType math_type) { + CPUContext* context) { EigenVectorMap y_vec(y, TransA == CblasNoTrans ? M : N); if (beta == 0) { // In Caffe2 we often do a lazy initialization, which may contain NaNs in @@ -215,22 +205,19 @@ void Gemv( } } -#define CAFFE2_SPECIALIZED_SCALE(T) \ - template <> \ - void Scale( \ - const int n, const float alpha, const T* x, T* y, CPUContext* context) { \ - EigenVectorMap(y, n) = ConstEigenVectorMap(x, n) * alpha; \ - } \ - template <> \ - void Scale( \ - const int n, \ - const float* alpha, \ - const T* x, \ - T* y, \ - CPUContext* context) { \ - EigenVectorMap(y, n) = ConstEigenVectorMap(x, n) * (*alpha); \ +#define CAFFE2_SPECIALIZED_SCALE(T) \ + template <> \ + void Scale( \ + const int n, const T alpha, const T* x, T* y, CPUContext* context) { \ + EigenVectorMap(y, n) = ConstEigenVectorMap(x, n) * alpha; \ + } \ + template <> \ + void Scale( \ + const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \ + EigenVectorMap(y, n) = ConstEigenVectorMap(x, n) * (*alpha); \ } CAFFE2_SPECIALIZED_SCALE(float) +CAFFE2_SPECIALIZED_SCALE(double) #undef CAFFE2_SPECIALIZED_SCALE #define CAFFE2_SPECIALIZED_DOT(T) \ @@ -241,6 +228,7 @@ void Dot( \ *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ } CAFFE2_SPECIALIZED_DOT(float) +CAFFE2_SPECIALIZED_DOT(double) #undef CAFFE2_SPECIALIZED_DOT #define CAFFE2_SPECIALIZED_AXPY(T) \ @@ -255,6 +243,7 @@ CAFFE2_SPECIALIZED_DOT(float) EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * (*alpha); \ } CAFFE2_SPECIALIZED_AXPY(float) +CAFFE2_SPECIALIZED_AXPY(double) #undef CAFFE2_SPECIALIZED_AXPY #define CAFFE2_SPECIALIZED_AXPBY(T) \ @@ -265,24 +254,16 @@ void Axpby(const int N, const T alpha, const T* x, \ y_vec = y_vec * beta + ConstEigenVectorMap(x, N) * alpha; \ } CAFFE2_SPECIALIZED_AXPBY(float) +CAFFE2_SPECIALIZED_AXPBY(double) #undef CAFFE2_SPECIALIZED_AXPBY #else // CAFFE2_USE_EIGEN_FOR_BLAS template <> void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const float* B, - const float beta, - float* C, - CPUContext* context, - TensorProto::DataType math_type) { + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, + const float* B, const float beta, float* C, CPUContext* context) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, @@ -311,39 +292,29 @@ void GemmEx( template <> void Gemv( - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const float alpha, - const float* A, - const float* x, - const float beta, - float* y, - CPUContext* context, - TensorProto::DataType math_type) { + const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, + const float* A, const float* x, const float beta, float* y, + CPUContext* context) { cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -#define CAFFE2_SPECIALIZED_SCALE(T, prefix) \ - template <> \ - void Scale( \ - const int n, const float alpha, const T* x, T* y, CPUContext* context) { \ - if (y != x) \ - cblas_##prefix##copy(n, x, 1, y, 1); \ - cblas_##prefix##scal(n, static_cast(alpha), y, 1); \ - } \ - template <> \ - void Scale( \ - const int n, \ - const float* alpha, \ - const T* x, \ - T* y, \ - CPUContext* context) { \ - if (y != x) \ - cblas_##prefix##copy(n, x, 1, y, 1); \ - cblas_##prefix##scal(n, static_cast(*alpha), y, 1); \ +#define CAFFE2_SPECIALIZED_SCALE(T, prefix) \ + template <> \ + void Scale( \ + const int n, const T alpha, const T* x, T* y, CPUContext* context) { \ + if (y != x) \ + cblas_##prefix##copy(n, x, 1, y, 1); \ + cblas_##prefix##scal(n, alpha, y, 1); \ + } \ + template <> \ + void Scale( \ + const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \ + if (y != x) \ + cblas_##prefix##copy(n, x, 1, y, 1); \ + cblas_##prefix##scal(n, *alpha, y, 1); \ } CAFFE2_SPECIALIZED_SCALE(float, s) +CAFFE2_SPECIALIZED_SCALE(double, d) #undef CAFFE2_SPECIALIZED_SCALE #define CAFFE2_SPECIALIZED_DOT(T, prefix) \ @@ -354,6 +325,7 @@ void Dot( \ *y = cblas_##prefix##dot(N, a, 1, b, 1); \ } CAFFE2_SPECIALIZED_DOT(float, s) +CAFFE2_SPECIALIZED_DOT(double, d) #undef CAFFE2_SPECIALIZED_DOT #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ @@ -368,6 +340,7 @@ CAFFE2_SPECIALIZED_DOT(float, s) cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ } CAFFE2_SPECIALIZED_AXPY(float, s) +CAFFE2_SPECIALIZED_AXPY(double, d) #undef CAFFE2_SPECIALIZED_AXPY // cblas_[sd]axpby is not a standard blas function, and if MKL is not present, @@ -389,6 +362,7 @@ void Axpby(const int N, const T alpha, const T* x, \ } #endif // CAFFE2_USE_MKL CAFFE2_SPECIALIZED_AXPBY(float, s) +CAFFE2_SPECIALIZED_AXPBY(double, d) #undef CAFFE2_SPECIALIZED_AXPBY #endif // CAFFE2_USE_EIGEN_FOR_BLAS @@ -462,8 +436,11 @@ void Funcname(const int N, const T* x, T* y, \ EigenVectorMap(y, N) = ConstEigenVectorMap(x, N).array().expr(); \ } DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) +DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp) DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log) +DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log) DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square) +DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square) #undef DELEGATE_SIMPLE_UNARY_FUNCTION #define DELEGATE_POWX_FUNCTION(T) \ @@ -473,6 +450,7 @@ void Powx( \ EigenVectorMap(y, N) = ConstEigenVectorMap(a, N).array().pow(b); \ } DELEGATE_POWX_FUNCTION(float) +DELEGATE_POWX_FUNCTION(double) #undef DELEGATE_POWX_FUNCTION #endif // CAFFE2_USE_MKL @@ -498,6 +476,7 @@ EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \ EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr) \ +EIGEN_SIMPLE_BINARY_FUNCTION(double, Funcname, expr) \ EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \ EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) @@ -567,6 +546,7 @@ CAFFE2_SPECIALIZED_COLWISEMAX(float) DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \ DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \ DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(double, name, op) DEFINE_BROADCAST_BINARY_FUNCTION(Add, +) DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -) @@ -622,6 +602,7 @@ CAFFE2_SPECIALIZED_SET(uint16_t); #define CAFFE2_DEFINE_BINARY_OP(name, op) \ CAFFE2_INSTANTIATE_BINARY_OP(name, op, float) \ + CAFFE2_INSTANTIATE_BINARY_OP(name, op, double) \ CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \ CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t) @@ -663,6 +644,7 @@ void Not( } CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); +CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(double); #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH template <> @@ -735,6 +717,7 @@ void RandGaussian( } CAFFE2_SPECIALIZED_SUM(float); +CAFFE2_SPECIALIZED_SUM(double); CAFFE2_SPECIALIZED_SUM(int32_t); CAFFE2_SPECIALIZED_SUM(int64_t); diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 46c5bc02f1..cfbe91b5d0 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -5,9 +5,8 @@ #include #include -#include "caffe2/core/context_gpu.h" -#include "caffe2/utils/conversions.h" #include "caffe2/utils/math.h" +#include "caffe2/core/context_gpu.h" #if THRUST_VERSION >= 100800 #define THRUST_SUPPORTS_PER_THREAD @@ -33,30 +32,33 @@ void Funcname( \ } DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf); +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Exp, exp); DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf); +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Log, log); __device__ float cuda_sqrf(const float x) { return x * x; } +__device__ double cuda_sqr(const double x) { return x * x; } DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, cuda_sqrf); +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sqr, cuda_sqr); #undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION -#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr) \ - __global__ void _Kernel_##T##_##Funcname( \ - const int N, const T* a, const T* b, T* y) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - float r = convert::To(a[i]) expr convert::To(b[i]); \ - y[i] = convert::To(r); \ - } \ - } \ - template <> \ - void Funcname( \ - const int N, const T* a, const T* b, T* y, CUDAContext* context) { \ - _Kernel_##T##_##Funcname<<< \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_CUDA_NUM_THREADS, \ - 0, \ - context->cuda_stream()>>>(N, a, b, y); \ +#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr) \ + __global__ void _Kernel_##T##_##Funcname( \ + const int N, const T* a, const T* b, T* y) { \ + CUDA_1D_KERNEL_LOOP(i, N) { \ + y[i] = a[i] expr b[i]; \ + } \ + } \ + template <> \ + void Funcname( \ + const int N, const T* a, const T* b, T* y, CUDAContext* context) { \ + _Kernel_##T##_##Funcname<<< \ + CAFFE_GET_BLOCKS(N), \ + CAFFE_CUDA_NUM_THREADS, \ + 0, \ + context->cuda_stream()>>>(N, a, b, y); \ } DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, +); @@ -64,27 +66,13 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, -); DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, *); DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, /); -DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Add, +); -DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Sub, -); -DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Mul, *); -DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Div, /); - // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template <> void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const float* B, - const float beta, - float* C, - CUDAContext* context, - TensorProto::DataType math_type) { + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, + const float* B, const float beta, float* C, CUDAContext* context) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (TransA == CblasNoTrans) ? K : M; @@ -110,92 +98,12 @@ void Gemm( N)); } -template <> -void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float16* A, - const float16* B, - const float beta, - float16* C, - CUDAContext* context, - TensorProto::DataType math_type) { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - if (math_type == TensorProto_DataType_FLOAT) { - CUBLAS_CHECK(cublasSgemmEx( - context->cublas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &beta, - C, - CUDA_R_16F, - N)); - - } else if (math_type == TensorProto_DataType_FLOAT16) { - // convert alpha, beta from caffe2::float16 -> __half - __half alpha_fp16; - alpha_fp16.x = convert::To(alpha).x; - __half beta_fp16; - beta_fp16.x = convert::To(beta).x; - // call cublasHgemm - CUBLAS_CHECK(cublasHgemm( - context->cublas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha_fp16, - (const __half*)B, - ldb, - (const __half*)A, - lda, - &beta_fp16, - (__half*)C, - N)); - } else { - // fail - CAFFE_THROW("Unsupported math type"); - } -} - template <> void GemmEx( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const int lda, - const float* B, - const int ldb, - const float beta, - float* C, - const int ldc, - CUDAContext* context) { + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, + const int lda, const float* B, const int ldb, const float beta, float* C, + const int ldc, CUDAContext* context) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = @@ -221,19 +129,40 @@ void GemmEx( template <> void Gemv( + const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, + const float* A, const float* x, const float beta, float* y, + CUDAContext* context) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_ENFORCE(cublasSgemv( + context->cublas_handle(), + cuTransA, + N, + M, + &alpha, + A, + N, + x, + 1, + &beta, + y, + 1)); +} + +template <> +void Gemv( const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, - const float* A, - const float* x, - const float beta, - float* y, - CUDAContext* context, - TensorProto::DataType math_type) { + const double alpha, + const double* A, + const double* x, + const double beta, + double* y, + CUDAContext* context) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_ENFORCE(cublasSgemv( + CUBLAS_ENFORCE(cublasDgemv( context->cublas_handle(), cuTransA, N, @@ -287,73 +216,6 @@ CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float); CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(double); #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH -template <> -void Gemv( - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const float alpha, - const float16* A, - const float16* x, - const float beta, - float16* y, - CUDAContext* context, - TensorProto::DataType math_type) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - - // sort out what we need to call cublasSgemmEx / cublasHgemm - int m = (cuTransA == CUBLAS_OP_N) ? N : M; - int k = (cuTransA == CUBLAS_OP_N) ? M : N; - int LDA = (cuTransA == CUBLAS_OP_N) ? m : k; - int LDC = m; - - if (math_type == TensorProto_DataType_FLOAT) { - CUBLAS_CHECK(cublasSgemmEx( - context->cublas_handle(), - cuTransA, - CUBLAS_OP_N, - m, - 1, - k, - &alpha, - A, - CUDA_R_16F, - LDA, - x, - CUDA_R_16F, - k, - &beta, - y, - CUDA_R_16F, - LDC)); - } else if (math_type == TensorProto_DataType_FLOAT16) { - __half alpha_fp16; - alpha_fp16.x = convert::To(alpha).x; - __half beta_fp16; - beta_fp16.x = convert::To(beta).x; - - CUBLAS_CHECK(cublasHgemm( - context->cublas_handle(), - cuTransA, - CUBLAS_OP_N, - m, - 1, - k, - &alpha_fp16, - (const __half*)A, - LDA, - (const __half*)x, - k, - &beta_fp16, - (__half*)y, - LDC)); - } else { - // fail - CAFFE_THROW("Unsupported math type"); - } -} - namespace { template __global__ void SetKernel(const int N, const T alpha, T* Y) { @@ -376,7 +238,6 @@ CAFFE2_SPECIALIZED_CUDA_SET(double); CAFFE2_SPECIALIZED_CUDA_SET(bool); CAFFE2_SPECIALIZED_CUDA_SET(int8_t); CAFFE2_SPECIALIZED_CUDA_SET(int16_t); -CAFFE2_SPECIALIZED_CUDA_SET(float16); CAFFE2_SPECIALIZED_CUDA_SET(int); CAFFE2_SPECIALIZED_CUDA_SET(int64_t); CAFFE2_SPECIALIZED_CUDA_SET(char); @@ -386,11 +247,11 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); namespace { template -__global__ void -UniformShift(const int N, const float min, const float max, T* x) { - float scale = max - min; +__global__ void UniformShift(const int N, const T min, const T max, + T* x) { + T scale = max - min; CUDA_1D_KERNEL_LOOP(i, N) { - x[i] = convert::To(convert::To(x[i]) * scale + min); + x[i] = x[i] * scale + min; } } @@ -475,6 +336,7 @@ void RandGaussian( context->curand_generator(), r, even_n, mean, std)); } + template<> void Dot( const int n, const float* a, const float* b, float* y, @@ -484,28 +346,13 @@ void Dot( context->Copy(1, &result, y); } -template <> -void Dot( - const int n, - const float16* a, - const float16* b, - float16* y, +template<> +void Dot( + const int n, const double* a, const double* b, double* y, CUDAContext* context) { - float16 result; - // execute with 32-bit math - CUBLAS_CHECK(cublasDotEx( - context->cublas_handle(), - n, - a, - CUDA_R_16F, - 1, - b, - CUDA_R_16F, - 1, - &result, - CUDA_R_16F, - CUDA_R_32F)); - context->Copy(1, &result, y); + double result; + CUBLAS_ENFORCE(cublasDdot(context->cublas_handle(), n, a, 1, b, 1, y)); + context->Copy(1, &result, y); } // A previous version of caffe2 used Thrust but it turns out that thrust @@ -516,7 +363,7 @@ void Dot( template __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { const int idx = threadIdx.x; - __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS]; + __shared__ T reduction_buffer[SUM_KERNEL_NTHREADS]; reduction_buffer[idx] = 0; @@ -524,12 +371,11 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { // N -> 128 if (!square) { for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { - reduction_buffer[idx] += convert::To(X[i]); + reduction_buffer[idx] += X[i]; } } else { for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { - float Xi = convert::To(X[i]); - reduction_buffer[idx] += Xi * Xi; + reduction_buffer[idx] += X[i] * X[i]; } } __syncthreads(); @@ -547,7 +393,7 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { for (int i = 0; i < 32; ++i) { tmp += reduction_buffer[i]; } - *Y = convert::To(tmp); + *Y = tmp; } } @@ -560,7 +406,7 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { } CAFFE2_MATH_SUM_FUNC(float) -CAFFE2_MATH_SUM_FUNC(float16) +CAFFE2_MATH_SUM_FUNC(double) #undef CAFFE2_MATH_SUM_FUNC #define CAFFE2_MATH_SUMSQR_FUNC(T) \ @@ -592,33 +438,18 @@ void Select( 0, context->cuda_stream()>>>(N, D, x, idx, y); } -template <> -void Select( - const int N, - const int D, - const float16* x, - const int* idx, - float16* y, - CUDAContext* context) { - SelectKernel<<< - CAFFE_GET_BLOCKS(N), - CAFFE_CUDA_NUM_THREADS, - 0, - context->cuda_stream()>>>(N, D, x, idx, y); -} - namespace { template -__global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) { +__global__ void ScaleKernel( + const int n, const T alpha, const T* x, T* y) { CUDA_1D_KERNEL_LOOP(i, n) { - // y[i] = convert::To(convert::To(x[i]) * alpha); - y[i] = convert::Get(convert::Get(x[i]) * alpha); + y[i] = x[i] * alpha; } } template -__global__ void -ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) { +__global__ void ScaleKernelDeviceAlpha( + const int n, const T* alpha, const T* x, T* y) { CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] * (*alpha); } @@ -630,20 +461,6 @@ __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { y[i] = powf(x[i], exponent); } } - -// fp16 specialization -template <> -__global__ void ScaleKernelDeviceAlpha( - const int n, - const float* alpha, - const float16* x, - float16* y) { - CUDA_1D_KERNEL_LOOP(i, n) { - y[i] = convert::To( - convert::To(x[i]) * (*alpha)); - } -} - } // namespace template <> @@ -672,17 +489,12 @@ void Scale( } template <> -void Scale( - const int n, - const float alpha, - const float16* x, - float16* y, +void Scale( + const int n, const double alpha, const double *x, double* y, CUDAContext* context) { - ScaleKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - context->cuda_stream()>>>(n, alpha, x, y); + ScaleKernel<<< + CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( + n, alpha, x, y); } template <> @@ -695,17 +507,11 @@ void Scale( } template <> -void Scale( - const int n, - const float* alpha, - const float16* x, - float16* y, +void Scale( + const int n, const double* alpha, const double *x, double* y, CUDAContext* context) { - ScaleKernelDeviceAlpha<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - context->cuda_stream()>>>(n, alpha, x, y); + ScaleKernelDeviceAlpha<<cuda_stream()>>>(n, alpha, x, y); } template <> @@ -721,42 +527,18 @@ void Axpy( template <> void Axpy( const int N, - const float alpha, + const double alpha, const double* X, double* Y, CUDAContext* context) { - double alpha_d{alpha}; - CUBLAS_ENFORCE( - cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1)); -} - -template <> -void Axpy( - const int N, - const float alpha, - const float16* X, - float16* Y, - CUDAContext* context) { - CUBLAS_CHECK(cublasAxpyEx( - context->cublas_handle(), - N, - &alpha, - CUDA_R_16F, - X, - CUDA_R_16F, - 1, - Y, - CUDA_R_16F, - 1, - CUDA_R_32F)); + CUBLAS_ENFORCE(cublasDaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); } namespace { template -__global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) { +__global__ void AxpyKernel(const int n, const T* a, const T* x, T* y) { CUDA_1D_KERNEL_LOOP(index, n) { - y[index] = convert::Get( - convert::Get(x[index]) * (*a) + convert::Get(y[index])); + y[index] += x[index] * (*a); } } } // namespace @@ -770,19 +552,14 @@ void Axpy( } template <> -void Axpy( - const int n, - const float* alpha, - const float16* X, - float16* Y, - CUDAContext* context) { - AxpyKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - context->cuda_stream()>>>(n, alpha, X, Y); +void Axpy( + const int n, const double* alpha, const double* X, + double* Y, CUDAContext* context) { + AxpyKernel<<cuda_stream()>>>(n, alpha, X, Y); } + namespace { template __global__ void AxpbyKernel(const int n, const T a, const T* x, @@ -801,6 +578,14 @@ void Axpby( 0, context->cuda_stream()>>>(n, a, x, b, y); } +template <> +void Axpby( + const int n, const double a, const double* x, const double b, double* y, + CUDAContext* context) { + AxpbyKernel<<cuda_stream()>>>(n, a, x, b, y); +} + namespace { template diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc index b1f930bec0..2ceeddd355 100644 --- a/caffe2/utils/math_gpu_test.cc +++ b/caffe2/utils/math_gpu_test.cc @@ -67,4 +67,61 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { } } +#define TEST_GEMV_WITH_TYPE(field_name) \ + TEST(MathUtilGPUTest, testGemv_##field_name) { \ + if (!HasCudaGPU()) \ + return; \ + Workspace ws; \ + DeviceOption option; \ + option.set_device_type(CUDA); \ + CUDAContext context(option); \ + Blob* blobx = ws.CreateBlob("X"); \ + Blob* bloby = ws.CreateBlob("Y"); \ + Blob* blobz = ws.CreateBlob("Z"); \ + Blob* bloby_host = ws.CreateBlob("Y_host"); \ + \ + vector shapex{64, 128}; \ + vector shapey{64}; \ + vector shapez{128}; \ + \ + auto* tensorx = blobx->GetMutable>(); \ + tensorx->Resize(shapex); \ + math::Set( \ + 64 * 128, \ + (field_name)1.0, \ + tensorx->mutable_data(), \ + &context); \ + \ + auto* tensory = bloby->GetMutable>(); \ + tensory->Resize(shapey); \ + math::Set( \ + 64, (field_name)1.0, tensory->mutable_data(), &context); \ + \ + auto* tensorz = blobz->GetMutable>(); \ + tensorz->Resize(shapez); \ + \ + math::Gemv( \ + CblasTrans, \ + 64, \ + 128, \ + 1.0, \ + tensorx->template data(), \ + tensory->mutable_data(), \ + 0.0, \ + tensorz->template mutable_data(), \ + &context); \ + context.FinishDeviceComputation(); \ + \ + auto* tensory_host = bloby_host->GetMutable>(); \ + tensory_host->CopyFrom(*tensorz, &context); \ + context.FinishDeviceComputation(); \ + \ + for (int i = 0; i < 128; i++) { \ + EXPECT_EQ(tensory_host->data()[i], 64.0); \ + } \ + } + +TEST_GEMV_WITH_TYPE(float); +TEST_GEMV_WITH_TYPE(double); + } // namespace caffe2 -- cgit v1.2.3