From 9ab077dc9d0bbe651348a498dd5472dc4d51f0af Mon Sep 17 00:00:00 2001
From: Aapo Kyrola <akyrola@fb.com>
Date: Mon, 17 Apr 2017 21:23:45 -0700
Subject: Revert D4871248: [caffe2][PR] fp16 support for FullyConnected op

Summary: This reverts commit 6a991c2c993dcf0b1e18aa3f2ffbe19e693dbadd

Differential Revision: D4871248

fbshipit-source-id: b6d812d09a00c83e363432e84742c503abfed65b
---
 caffe2/utils/conversions.h    | 182 ------------------
 caffe2/utils/math-detail.h    |  36 +---
 caffe2/utils/math.h           |  53 ++----
 caffe2/utils/math_cpu.cc      | 119 +++++-------
 caffe2/utils/math_gpu.cu      | 423 +++++++++++-------------------------------
 caffe2/utils/math_gpu_test.cc |  57 ++++++
 6 files changed, 238 insertions(+), 632 deletions(-)
 delete mode 100644 caffe2/utils/conversions.h

(limited to 'caffe2/utils')
diff --git a/caffe2/utils/conversions.h b/caffe2/utils/conversions.h
deleted file mode 100644
index 0c6c3238f7..0000000000
--- a/caffe2/utils/conversions.h
+++ /dev/null
@@ -1,182 +0,0 @@
-#pragma once
-
-#include <caffe2/core/types.h>
-
-#ifdef __CUDA_ARCH__
-#include <cuda_fp16.h>
-#endif
-
-#ifdef __CUDA_ARCH__
-#define CONVERSIONS_DECL __host__ __device__ inline
-#else
-#define CONVERSIONS_DECL inline
-#endif
-
-namespace caffe2 {
-
-namespace convert {
-
-namespace {
-inline float16 cpu_float2half_rn(float f) {
-  float16 ret;
-
-  static_assert(
-      sizeof(unsigned int) == sizeof(float),
-      "Programming error sizeof(unsigned int) != sizeof(float)");
-
-  unsigned* xp = reinterpret_cast<unsigned int*>(&f);
-  unsigned x = *xp;
-  unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
-  unsigned sign, exponent, mantissa;
-
-  // Get rid of +NaN/-NaN case first.
-  if (u > 0x7f800000) {
-    ret.x = 0x7fffU;
-    return ret;
-  }
-
-  sign = ((x >> 16) & 0x8000);
-
-  // Get rid of +Inf/-Inf, +0/-0.
-  if (u > 0x477fefff) {
-    ret.x = sign | 0x7c00U;
-    return ret;
-  }
-  if (u < 0x33000001) {
-    ret.x = (sign | 0x0000);
-    return ret;
-  }
-
-  exponent = ((u >> 23) & 0xff);
-  mantissa = (u & 0x7fffff);
-
-  if (exponent > 0x70) {
-    shift = 13;
-    exponent -= 0x70;
-  } else {
-    shift = 0x7e - exponent;
-    exponent = 0;
-    mantissa |= 0x800000;
-  }
-  lsb = (1 << shift);
-  lsb_s1 = (lsb >> 1);
-  lsb_m1 = (lsb - 1);
-
-  // Round to nearest even.
-  remainder = (mantissa & lsb_m1);
-  mantissa >>= shift;
-  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
-    ++mantissa;
-    if (!(mantissa & 0x3ff)) {
-      ++exponent;
-      mantissa = 0;
-    }
-  }
-
-  ret.x = (sign | (exponent << 10) | mantissa);
-
-  return ret;
-}
-
-inline float cpu_half2float(float16 h) {
-  unsigned sign = ((h.x >> 15) & 1);
-  unsigned exponent = ((h.x >> 10) & 0x1f);
-  unsigned mantissa = ((h.x & 0x3ff) << 13);
-
-  if (exponent == 0x1f) { /* NaN or Inf */
-    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
-    exponent = 0xff;
-  } else if (!exponent) { /* Denorm or Zero */
-    if (mantissa) {
-      unsigned int msb;
-      exponent = 0x71;
-      do {
-        msb = (mantissa & 0x400000);
-        mantissa <<= 1; /* normalize */
-        --exponent;
-      } while (!msb);
-      mantissa &= 0x7fffff; /* 1.mantissa is implicit */
-    }
-  } else {
-    exponent += 0x70;
-  }
-
-  int temp = ((sign << 31) | (exponent << 23) | mantissa);
-
-  unsigned* rp = reinterpret_cast<unsigned*>(&temp);
-  return *rp;
-}
-
-}; // anonymous
-// general version: defer to static_cast
-template <typename IN, typename OUT>
-CONVERSIONS_DECL OUT To(const IN in) {
-  return static_cast<OUT>(in);
-}
-
-#if __CUDA_ARCH__
-__device__ __inline__ __half inf_clip(__half h) {
-  int isi = __hisinf(h);
-  if (isi > 0) {
-    // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
-    h.x = 0x7bffU;
-  } else if (isi < 0) {
-    // As above, negated
-    h.x = 0x7bffU ^ 0x8000;
-  }
-  return h;
-}
-#endif
-
-// explicit for fp16
-template <>
-CONVERSIONS_DECL float16 To(const float in) {
-#if __CUDA_ARCH__
-  // hacky interface between C2 fp16 and CUDA
-  float16 ret;
-  __half r;
-  // r.x = __float2half_rn(in);
-  // ret.x = inf_clip(r).x;
-  ret.x = __float2half(in).x;
-  return ret;
-#else
-  return cpu_float2half_rn(in);
-#endif
-}
-
-template <>
-CONVERSIONS_DECL float To(const float16 in) {
-#if __CUDA_ARCH__
-  __half tmp;
-  tmp.x = in.x;
-  return __half2float(tmp);
-#else
-  return cpu_half2float(in);
-#endif
-};
-
-template <>
-CONVERSIONS_DECL float To(const float in) {
-  return in;
-}
-
-template <typename OUT, typename IN>
-CONVERSIONS_DECL OUT Get(IN x) {
-  return static_cast<OUT>(x);
-}
-
-template <>
-CONVERSIONS_DECL float Get(float16 x) {
-  return To<float16, float>(x);
-}
-
-template <>
-CONVERSIONS_DECL float16 Get(float x) {
-  return To<float, float16>(x);
-}
-
-}; // namespace convert
-
-}; // namespace caffe2
-
-#undef CONVERSIONS_DECL
diff --git a/caffe2/utils/math-detail.h b/caffe2/utils/math-detail.h
index 07a1f997d6..35a880a6d4 100644
--- a/caffe2/utils/math-detail.h
+++ b/caffe2/utils/math-detail.h
@@ -11,12 +11,8 @@ namespace detail {
 
 template<typename T, class Context, int FixedSize>
 struct ScaleImpl {
-  inline void operator()(
-      const int N,
-      const float alpha,
-      const T* x,
-      T* y,
-      Context* context) {
+  inline void
+  operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
     Scale(N, alpha, x, y, context);
   }
 };
@@ -26,7 +22,7 @@ template<typename T>
 struct ScaleImpl<T, CPUContext, 1> {
   inline void operator()(
       const int N,
-      const float alpha,
+      const T alpha,
       const T* x,
       T* y,
       CPUContext* context) {
@@ -37,12 +33,8 @@ struct ScaleImpl<T, CPUContext, 1> {
 
 template<typename T, class Context, int FixedSize>
 struct AxpyImpl {
-  inline void operator()(
-      const int N,
-      const float alpha,
-      const T* x,
-      T* y,
-      Context* context) {
+  inline void
+  operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
     Axpy(N, alpha, x, y, context);
   }
 };
@@ -52,7 +44,7 @@ template<typename T>
 struct AxpyImpl<T, CPUContext, 1> {
   inline void operator()(
       const int N,
-      const float alpha,
+      const T alpha,
       const T* x,
       T* y,
       CPUContext* context) {
@@ -65,22 +57,14 @@ struct AxpyImpl<T, CPUContext, 1> {
 }  // namespace detail
 
 template <typename T, class Context, int FixedSize>
-inline void ScaleFixedSize(
-    const int N,
-    const float alpha,
-    const T* x,
-    T* y,
-    Context* context) {
+inline void
+ScaleFixedSize(const int N, const T alpha, const T* x, T* y, Context* context) {
   detail::ScaleImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
 }
 
 template <typename T, class Context, int FixedSize>
-inline void AxpyFixedSize(
-    const int N,
-    const float alpha,
-    const T* x,
-    T* y,
-    Context* context) {
+inline void
+AxpyFixedSize(const int N, const T alpha, const T* x, T* y, Context* context) {
   detail::AxpyImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
 }
 
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index 105cb19733..a2472c0d33 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -141,20 +141,10 @@ void ColwiseMax(const int N, const int D, const T* x, T* y,
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
-template <typename T, class Context, class Engine = DefaultEngine>
-void Gemm(
-    const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB,
-    const int M,
-    const int N,
-    const int K,
-    const float alpha,
-    const T* A,
-    const T* B,
-    const float beta,
-    T* C,
-    Context* context,
-    TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
+template <typename T, class Context, class Engine=DefaultEngine>
+void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+    const int M, const int N, const int K, const T alpha, const T* A,
+    const T* B, const T beta, T* C, Context* context);
 
 // We also provide a gemm that has explicit lda, ldb and ldc specified.
 // In most cases you probably want to use the function above, though.
@@ -179,18 +169,10 @@ void GemmEx(
 // to Trans, the output is:
 // CblasNoTrans: x is an N dim vector and y is an M dim vector.
 // CblasTrans:   x is an M dim vector and y is an N dim vector.
-template <typename T, class Context, class Engine = DefaultEngine>
-void Gemv(
-    const CBLAS_TRANSPOSE TransA,
-    const int M,
-    const int N,
-    const float alpha,
-    const T* A,
-    const T* x,
-    const float beta,
-    T* y,
-    Context* context,
-    TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
+template <typename T, class Context, class Engine=DefaultEngine>
+void Gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+    const T alpha, const T* A, const T* x, const T beta,
+    T* y, Context* context);
 
 template <typename T, class Context>
 void Set(const TIndex N, const T alpha, T* X, Context* context);
@@ -236,31 +218,28 @@ void Select(const int N, const int D, const T* x, const int* idx, T* y,
             Context* context);
 
 template <typename T, class Context>
-void Scale(const int N, const float alpha, const T* x, T* y, Context* context);
+void Scale(const int N, const T alpha, const T* x, T* y, Context* context);
 
 // Different from the Scale function above, if alpha is passed in
 // as a pointer, we will assume that it lives on the Context device,
 // for example on GPU.
 template <typename T, class Context>
-void Scale(const int N, const float* alpha, const T* x, T* y, Context* context);
+void Scale(const int N, const T* alpha, const T* x, T* y,
+           Context* context);
 
 template <typename T, class Context>
-void Axpy(const int N, const float alpha, const T* x, T* y, Context* context);
+void Axpy(const int N, const T alpha, const T* x, T* y, Context* context);
 
 // Different from the Axpy function above, if alpha is passed in
 // as a pointer, we will assume that it lives on the Context device,
 // for example on GPU.
 template <typename T, class Context>
-void Axpy(const int N, const float* alpha, const T* x, T* y, Context* context);
+void Axpy(const int N, const T* alpha, const T* x, T* y,
+          Context* context);
 
 template <typename T, class Context>
-void Axpby(
-    const int N,
-    const float alpha,
-    const T* x,
-    const T b,
-    T* y,
-    Context* context);
+void Axpby(const int N, const T alpha, const T* x, const T b, T* y,
+           Context* context);
 
 template <typename T, class Context, int order>
 void Im2colNd(
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
index e4340dfaf1..5cac0c8339 100644
--- a/caffe2/utils/math_cpu.cc
+++ b/caffe2/utils/math_cpu.cc
@@ -58,18 +58,9 @@ namespace math {
 // CblasTrans, respectively, for each of A and B.
 template <>
 void Gemm<float, CPUContext>(
-    const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB,
-    const int M,
-    const int N,
-    const int K,
-    const float alpha,
-    const float* A,
-    const float* B,
-    const float beta,
-    float* C,
-    CPUContext* context,
-    TensorProto::DataType math_type) {
+    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+    const int M, const int N, const int K, const float alpha, const float* A,
+    const float* B, const float beta, float* C, CPUContext* context) {
   auto C_mat = EigenMatrixMap<float>(C, N, M);
   if (beta == 0) {
     C_mat.setZero();
@@ -187,8 +178,7 @@ void Gemv<float, CPUContext>(
     const float* x,
     const float beta,
     float* y,
-    CPUContext* context,
-    TensorProto::DataType math_type) {
+    CPUContext* context) {
   EigenVectorMap<float> y_vec(y, TransA == CblasNoTrans ? M : N);
   if (beta == 0) {
     // In Caffe2 we often do a lazy initialization, which may contain NaNs in
@@ -215,22 +205,19 @@ void Gemv<float, CPUContext>(
   }
 }
 
-#define CAFFE2_SPECIALIZED_SCALE(T)                                            \
-  template <>                                                                  \
-  void Scale<T, CPUContext>(                                                   \
-      const int n, const float alpha, const T* x, T* y, CPUContext* context) { \
-    EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha;            \
-  }                                                                            \
-  template <>                                                                  \
-  void Scale<T, CPUContext>(                                                   \
-      const int n,                                                             \
-      const float* alpha,                                                      \
-      const T* x,                                                              \
-      T* y,                                                                    \
-      CPUContext* context) {                                                   \
-    EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha);         \
+#define CAFFE2_SPECIALIZED_SCALE(T)                                         \
+  template <>                                                               \
+  void Scale<T, CPUContext>(                                                \
+      const int n, const T alpha, const T* x, T* y, CPUContext* context) {  \
+    EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha;         \
+  }                                                                         \
+  template <>                                                               \
+  void Scale<T, CPUContext>(                                                \
+      const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
+    EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha);      \
   }
 CAFFE2_SPECIALIZED_SCALE(float)
+CAFFE2_SPECIALIZED_SCALE(double)
 #undef CAFFE2_SPECIALIZED_SCALE
 
 #define CAFFE2_SPECIALIZED_DOT(T)                                              \
@@ -241,6 +228,7 @@ void Dot<T, CPUContext>(                                                       \
   *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N));         \
 }
 CAFFE2_SPECIALIZED_DOT(float)
+CAFFE2_SPECIALIZED_DOT(double)
 #undef CAFFE2_SPECIALIZED_DOT
 
 #define CAFFE2_SPECIALIZED_AXPY(T)                                          \
@@ -255,6 +243,7 @@ CAFFE2_SPECIALIZED_DOT(float)
     EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha);     \
   }
 CAFFE2_SPECIALIZED_AXPY(float)
+CAFFE2_SPECIALIZED_AXPY(double)
 #undef CAFFE2_SPECIALIZED_AXPY
 
 #define CAFFE2_SPECIALIZED_AXPBY(T)                                            \
@@ -265,24 +254,16 @@ void Axpby<T, CPUContext>(const int N, const T alpha, const T* x,              \
   y_vec = y_vec * beta + ConstEigenVectorMap<T>(x, N) * alpha;                 \
 }
 CAFFE2_SPECIALIZED_AXPBY(float)
+CAFFE2_SPECIALIZED_AXPBY(double)
 #undef CAFFE2_SPECIALIZED_AXPBY
 
 #else  // CAFFE2_USE_EIGEN_FOR_BLAS
 
 template <>
 void Gemm<float, CPUContext>(
-    const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB,
-    const int M,
-    const int N,
-    const int K,
-    const float alpha,
-    const float* A,
-    const float* B,
-    const float beta,
-    float* C,
-    CPUContext* context,
-    TensorProto::DataType math_type) {
+    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+    const int M, const int N, const int K, const float alpha, const float* A,
+    const float* B, const float beta, float* C, CPUContext* context) {
   int lda = (TransA == CblasNoTrans) ? K : M;
   int ldb = (TransB == CblasNoTrans) ? N : K;
   cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
@@ -311,39 +292,29 @@ void GemmEx<float, CPUContext>(
 
 template <>
 void Gemv<float, CPUContext>(
-    const CBLAS_TRANSPOSE TransA,
-    const int M,
-    const int N,
-    const float alpha,
-    const float* A,
-    const float* x,
-    const float beta,
-    float* y,
-    CPUContext* context,
-    TensorProto::DataType math_type) {
+    const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha,
+    const float* A, const float* x, const float beta, float* y,
+    CPUContext* context) {
   cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
-#define CAFFE2_SPECIALIZED_SCALE(T, prefix)                                    \
-  template <>                                                                  \
-  void Scale<T, CPUContext>(                                                   \
-      const int n, const float alpha, const T* x, T* y, CPUContext* context) { \
-    if (y != x)                                                                \
-      cblas_##prefix##copy(n, x, 1, y, 1);                                     \
-    cblas_##prefix##scal(n, static_cast<float>(alpha), y, 1);                  \
-  }                                                                            \
-  template <>                                                                  \
-  void Scale<T, CPUContext>(                                                   \
-      const int n,                                                             \
-      const float* alpha,                                                      \
-      const T* x,                                                              \
-      T* y,                                                                    \
-      CPUContext* context) {                                                   \
-    if (y != x)                                                                \
-      cblas_##prefix##copy(n, x, 1, y, 1);                                     \
-    cblas_##prefix##scal(n, static_cast<float>(*alpha), y, 1);                 \
+#define CAFFE2_SPECIALIZED_SCALE(T, prefix)                                 \
+  template <>                                                               \
+  void Scale<T, CPUContext>(                                                \
+      const int n, const T alpha, const T* x, T* y, CPUContext* context) {  \
+    if (y != x)                                                             \
+      cblas_##prefix##copy(n, x, 1, y, 1);                                  \
+    cblas_##prefix##scal(n, alpha, y, 1);                                   \
+  }                                                                         \
+  template <>                                                               \
+  void Scale<T, CPUContext>(                                                \
+      const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
+    if (y != x)                                                             \
+      cblas_##prefix##copy(n, x, 1, y, 1);                                  \
+    cblas_##prefix##scal(n, *alpha, y, 1);                                  \
   }
 CAFFE2_SPECIALIZED_SCALE(float, s)
+CAFFE2_SPECIALIZED_SCALE(double, d)
 #undef CAFFE2_SPECIALIZED_SCALE
 
 #define CAFFE2_SPECIALIZED_DOT(T, prefix)                                      \
@@ -354,6 +325,7 @@ void Dot<T, CPUContext>(                                                       \
   *y = cblas_##prefix##dot(N, a, 1, b, 1);                                     \
 }
 CAFFE2_SPECIALIZED_DOT(float, s)
+CAFFE2_SPECIALIZED_DOT(double, d)
 #undef CAFFE2_SPECIALIZED_DOT
 
 #define CAFFE2_SPECIALIZED_AXPY(T, prefix)                                  \
@@ -368,6 +340,7 @@ CAFFE2_SPECIALIZED_DOT(float, s)
     cblas_##prefix##axpy(N, *alpha, x, 1, y, 1);                            \
   }
 CAFFE2_SPECIALIZED_AXPY(float, s)
+CAFFE2_SPECIALIZED_AXPY(double, d)
 #undef CAFFE2_SPECIALIZED_AXPY
 
 // cblas_[sd]axpby is not a standard blas function, and if MKL is not present,
@@ -389,6 +362,7 @@ void Axpby<T, CPUContext>(const int N, const T alpha, const T* x,              \
 }
 #endif  // CAFFE2_USE_MKL
 CAFFE2_SPECIALIZED_AXPBY(float, s)
+CAFFE2_SPECIALIZED_AXPBY(double, d)
 #undef CAFFE2_SPECIALIZED_AXPBY
 
 #endif  // CAFFE2_USE_EIGEN_FOR_BLAS
@@ -462,8 +436,11 @@ void Funcname<T, CPUContext>(const int N, const T* x, T* y,                    \
   EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(x, N).array().expr();       \
 }
 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp)
 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log)
 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square)
 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
 
 #define DELEGATE_POWX_FUNCTION(T)                                              \
@@ -473,6 +450,7 @@ void Powx<T, CPUContext>(                                                      \
   EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(a, N).array().pow(b);       \
 }
 DELEGATE_POWX_FUNCTION(float)
+DELEGATE_POWX_FUNCTION(double)
 #undef DELEGATE_POWX_FUNCTION
 
 #endif  // CAFFE2_USE_MKL
@@ -498,6 +476,7 @@ EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
 
 #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr)                          \
 EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr)                            \
+EIGEN_SIMPLE_BINARY_FUNCTION(double, Funcname, expr)                           \
 EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr)                          \
 EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
 
@@ -567,6 +546,7 @@ CAFFE2_SPECIALIZED_COLWISEMAX(float)
   DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op)                  \
   DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op)                  \
   DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op)                    \
+  DELEGATE_BROADCAST_BINARY_FUNCTION(double, name, op)
 
 DEFINE_BROADCAST_BINARY_FUNCTION(Add, +)
 DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -)
@@ -622,6 +602,7 @@ CAFFE2_SPECIALIZED_SET(uint16_t);
 
 #define CAFFE2_DEFINE_BINARY_OP(name, op)         \
   CAFFE2_INSTANTIATE_BINARY_OP(name, op, float)   \
+  CAFFE2_INSTANTIATE_BINARY_OP(name, op, double)  \
   CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \
   CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t)
 
@@ -663,6 +644,7 @@ void Not<bool, CPUContext>(
   }
 
 CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
+CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(double);
 #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH
 
 template <>
@@ -735,6 +717,7 @@ void RandGaussian<float, CPUContext>(
   }
 
 CAFFE2_SPECIALIZED_SUM(float);
+CAFFE2_SPECIALIZED_SUM(double);
 CAFFE2_SPECIALIZED_SUM(int32_t);
 CAFFE2_SPECIALIZED_SUM(int64_t);
 
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 46c5bc02f1..cfbe91b5d0 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -5,9 +5,8 @@
 #include <thrust/system/cuda/detail/par.h>
 #include <thrust/version.h>
 
-#include "caffe2/core/context_gpu.h"
-#include "caffe2/utils/conversions.h"
 #include "caffe2/utils/math.h"
+#include "caffe2/core/context_gpu.h"
 
 #if THRUST_VERSION >= 100800
 #define THRUST_SUPPORTS_PER_THREAD
@@ -33,30 +32,33 @@ void Funcname<T, CUDAContext>(                                                 \
 }
 
 DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf);
+DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Exp, exp);
 DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf);
+DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Log, log);
 
 __device__ float cuda_sqrf(const float x) { return x * x; }
+__device__ double cuda_sqr(const double x) { return x * x; }
 
 DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, cuda_sqrf);
+DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sqr, cuda_sqr);
 
 #undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION
 
-#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr)               \
-  __global__ void _Kernel_##T##_##Funcname(                                   \
-      const int N, const T* a, const T* b, T* y) {                            \
-    CUDA_1D_KERNEL_LOOP(i, N) {                                               \
-      float r = convert::To<T, float>(a[i]) expr convert::To<T, float>(b[i]); \
-      y[i] = convert::To<float, T>(r);                                        \
-    }                                                                         \
-  }                                                                           \
-  template <>                                                                 \
-  void Funcname<T, CUDAContext>(                                              \
-      const int N, const T* a, const T* b, T* y, CUDAContext* context) {      \
-    _Kernel_##T##_##Funcname<<<                                               \
-        CAFFE_GET_BLOCKS(N),                                                  \
-        CAFFE_CUDA_NUM_THREADS,                                               \
-        0,                                                                    \
-        context->cuda_stream()>>>(N, a, b, y);                                \
+#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr)          \
+  __global__ void _Kernel_##T##_##Funcname(                              \
+      const int N, const T* a, const T* b, T* y) {                       \
+    CUDA_1D_KERNEL_LOOP(i, N) {                                          \
+      y[i] = a[i] expr b[i];                                             \
+    }                                                                    \
+  }                                                                      \
+  template <>                                                            \
+  void Funcname<T, CUDAContext>(                                         \
+      const int N, const T* a, const T* b, T* y, CUDAContext* context) { \
+    _Kernel_##T##_##Funcname<<<                                          \
+        CAFFE_GET_BLOCKS(N),                                             \
+        CAFFE_CUDA_NUM_THREADS,                                          \
+        0,                                                               \
+        context->cuda_stream()>>>(N, a, b, y);                           \
   }
 
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, +);
@@ -64,27 +66,13 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, -);
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, *);
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, /);
 
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Add, +);
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Sub, -);
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Mul, *);
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Div, /);
-
 // Caffe2 gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
 template <>
 void Gemm<float, CUDAContext>(
-    const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB,
-    const int M,
-    const int N,
-    const int K,
-    const float alpha,
-    const float* A,
-    const float* B,
-    const float beta,
-    float* C,
-    CUDAContext* context,
-    TensorProto::DataType math_type) {
+    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+    const int M, const int N, const int K, const float alpha, const float* A,
+    const float* B, const float beta, float* C, CUDAContext* context) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (TransA == CblasNoTrans) ? K : M;
@@ -110,92 +98,12 @@ void Gemm<float, CUDAContext>(
       N));
 }
 
-template <>
-void Gemm<float16, CUDAContext>(
-    const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB,
-    const int M,
-    const int N,
-    const int K,
-    const float alpha,
-    const float16* A,
-    const float16* B,
-    const float beta,
-    float16* C,
-    CUDAContext* context,
-    TensorProto::DataType math_type) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (TransA == CblasNoTrans) ? K : M;
-  int ldb = (TransB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  if (math_type == TensorProto_DataType_FLOAT) {
-    CUBLAS_CHECK(cublasSgemmEx(
-        context->cublas_handle(),
-        cuTransB,
-        cuTransA,
-        N,
-        M,
-        K,
-        &alpha,
-        B,
-        CUDA_R_16F,
-        ldb,
-        A,
-        CUDA_R_16F,
-        lda,
-        &beta,
-        C,
-        CUDA_R_16F,
-        N));
-
-  } else if (math_type == TensorProto_DataType_FLOAT16) {
-    // convert alpha, beta from caffe2::float16 -> __half
-    __half alpha_fp16;
-    alpha_fp16.x = convert::To<float, float16>(alpha).x;
-    __half beta_fp16;
-    beta_fp16.x = convert::To<float, float16>(beta).x;
-    // call cublasHgemm
-    CUBLAS_CHECK(cublasHgemm(
-        context->cublas_handle(),
-        cuTransB,
-        cuTransA,
-        N,
-        M,
-        K,
-        &alpha_fp16,
-        (const __half*)B,
-        ldb,
-        (const __half*)A,
-        lda,
-        &beta_fp16,
-        (__half*)C,
-        N));
-  } else {
-    // fail
-    CAFFE_THROW("Unsupported math type");
-  }
-}
-
 template <>
 void GemmEx<float, CUDAContext>(
-    const CBLAS_TRANSPOSE TransA,
-    const CBLAS_TRANSPOSE TransB,
-    const int M,
-    const int N,
-    const int K,
-    const float alpha,
-    const float* A,
-    const int lda,
-    const float* B,
-    const int ldb,
-    const float beta,
-    float* C,
-    const int ldc,
-    CUDAContext* context) {
+    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+    const int M, const int N, const int K, const float alpha, const float* A,
+    const int lda, const float* B, const int ldb, const float beta, float* C,
+    const int ldc, CUDAContext* context) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   cublasOperation_t cuTransA =
@@ -221,19 +129,40 @@ void GemmEx<float, CUDAContext>(
 
 template <>
 void Gemv<float, CUDAContext>(
+    const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha,
+    const float* A, const float* x, const float beta, float* y,
+    CUDAContext* context) {
+  cublasOperation_t cuTransA =
+      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBLAS_ENFORCE(cublasSgemv(
+      context->cublas_handle(),
+      cuTransA,
+      N,
+      M,
+      &alpha,
+      A,
+      N,
+      x,
+      1,
+      &beta,
+      y,
+      1));
+}
+
+template <>
+void Gemv<double, CUDAContext>(
     const CBLAS_TRANSPOSE TransA,
     const int M,
     const int N,
-    const float alpha,
-    const float* A,
-    const float* x,
-    const float beta,
-    float* y,
-    CUDAContext* context,
-    TensorProto::DataType math_type) {
+    const double alpha,
+    const double* A,
+    const double* x,
+    const double beta,
+    double* y,
+    CUDAContext* context) {
   cublasOperation_t cuTransA =
       (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
-  CUBLAS_ENFORCE(cublasSgemv(
+  CUBLAS_ENFORCE(cublasDgemv(
       context->cublas_handle(),
       cuTransA,
       N,
@@ -287,73 +216,6 @@ CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float);
 CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(double);
 #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH
 
-template <>
-void Gemv<float16, CUDAContext>(
-    const CBLAS_TRANSPOSE TransA,
-    const int M,
-    const int N,
-    const float alpha,
-    const float16* A,
-    const float16* x,
-    const float beta,
-    float16* y,
-    CUDAContext* context,
-    TensorProto::DataType math_type) {
-  cublasOperation_t cuTransA =
-      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-  // sort out what we need to call cublasSgemmEx / cublasHgemm
-  int m = (cuTransA == CUBLAS_OP_N) ? N : M;
-  int k = (cuTransA == CUBLAS_OP_N) ? M : N;
-  int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
-  int LDC = m;
-
-  if (math_type == TensorProto_DataType_FLOAT) {
-    CUBLAS_CHECK(cublasSgemmEx(
-        context->cublas_handle(),
-        cuTransA,
-        CUBLAS_OP_N,
-        m,
-        1,
-        k,
-        &alpha,
-        A,
-        CUDA_R_16F,
-        LDA,
-        x,
-        CUDA_R_16F,
-        k,
-        &beta,
-        y,
-        CUDA_R_16F,
-        LDC));
-  } else if (math_type == TensorProto_DataType_FLOAT16) {
-    __half alpha_fp16;
-    alpha_fp16.x = convert::To<float, float16>(alpha).x;
-    __half beta_fp16;
-    beta_fp16.x = convert::To<float, float16>(beta).x;
-
-    CUBLAS_CHECK(cublasHgemm(
-        context->cublas_handle(),
-        cuTransA,
-        CUBLAS_OP_N,
-        m,
-        1,
-        k,
-        &alpha_fp16,
-        (const __half*)A,
-        LDA,
-        (const __half*)x,
-        k,
-        &beta_fp16,
-        (__half*)y,
-        LDC));
-  } else {
-    // fail
-    CAFFE_THROW("Unsupported math type");
-  }
-}
-
 namespace {
 template <typename T>
 __global__ void SetKernel(const int N, const T alpha, T* Y) {
@@ -376,7 +238,6 @@ CAFFE2_SPECIALIZED_CUDA_SET(double);
 CAFFE2_SPECIALIZED_CUDA_SET(bool);
 CAFFE2_SPECIALIZED_CUDA_SET(int8_t);
 CAFFE2_SPECIALIZED_CUDA_SET(int16_t);
-CAFFE2_SPECIALIZED_CUDA_SET(float16);
 CAFFE2_SPECIALIZED_CUDA_SET(int);
 CAFFE2_SPECIALIZED_CUDA_SET(int64_t);
 CAFFE2_SPECIALIZED_CUDA_SET(char);
@@ -386,11 +247,11 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);
 
 namespace {
 template <typename T>
-__global__ void
-UniformShift(const int N, const float min, const float max, T* x) {
-  float scale = max - min;
+__global__ void UniformShift(const int N, const T min, const T max,
+                             T* x) {
+  T scale = max - min;
   CUDA_1D_KERNEL_LOOP(i, N) {
-    x[i] = convert::To<float, T>(convert::To<T, float>(x[i]) * scale + min);
+    x[i] = x[i] * scale + min;
   }
 }
 
@@ -475,6 +336,7 @@ void RandGaussian<double, CUDAContext>(
       context->curand_generator(), r, even_n, mean, std));
 }
 
+
 template<>
 void Dot<float, CUDAContext>(
     const int n, const float* a, const float* b, float* y,
@@ -484,28 +346,13 @@ void Dot<float, CUDAContext>(
   context->Copy<float, CPUContext, CUDAContext>(1, &result, y);
 }
 
-template <>
-void Dot<float16, CUDAContext>(
-    const int n,
-    const float16* a,
-    const float16* b,
-    float16* y,
+template<>
+void Dot<double, CUDAContext>(
+    const int n, const double* a, const double* b, double* y,
     CUDAContext* context) {
-  float16 result;
-  // execute with 32-bit math
-  CUBLAS_CHECK(cublasDotEx(
-      context->cublas_handle(),
-      n,
-      a,
-      CUDA_R_16F,
-      1,
-      b,
-      CUDA_R_16F,
-      1,
-      &result,
-      CUDA_R_16F,
-      CUDA_R_32F));
-  context->Copy<float16, CPUContext, CUDAContext>(1, &result, y);
+  double result;
+  CUBLAS_ENFORCE(cublasDdot(context->cublas_handle(), n, a, 1, b, 1, y));
+  context->Copy<double, CPUContext, CUDAContext>(1, &result, y);
 }
 
 // A previous version of caffe2 used Thrust but it turns out that thrust
@@ -516,7 +363,7 @@ void Dot<float16, CUDAContext>(
 template <typename T>
 __global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
   const int idx = threadIdx.x;
-  __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS];
+  __shared__ T reduction_buffer[SUM_KERNEL_NTHREADS];
 
   reduction_buffer[idx] = 0;
 
@@ -524,12 +371,11 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
   // N -> 128
   if (!square) {
     for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
-      reduction_buffer[idx] += convert::To<T, float>(X[i]);
+      reduction_buffer[idx] += X[i];
     }
   } else {
     for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
-      float Xi = convert::To<T, float>(X[i]);
-      reduction_buffer[idx] += Xi * Xi;
+      reduction_buffer[idx] += X[i] * X[i];
     }
   }
   __syncthreads();
@@ -547,7 +393,7 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
     for (int i = 0; i < 32; ++i) {
       tmp += reduction_buffer[i];
     }
-    *Y = convert::To<float, T>(tmp);
+    *Y = tmp;
   }
 }
 
@@ -560,7 +406,7 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
   }
 
 CAFFE2_MATH_SUM_FUNC(float)
-CAFFE2_MATH_SUM_FUNC(float16)
+CAFFE2_MATH_SUM_FUNC(double)
 #undef CAFFE2_MATH_SUM_FUNC
 
 #define CAFFE2_MATH_SUMSQR_FUNC(T)                                    \
@@ -592,33 +438,18 @@ void Select<float, CUDAContext>(
                         0, context->cuda_stream()>>>(N, D, x, idx, y);
 }
 
-template <>
-void Select<float16, CUDAContext>(
-    const int N,
-    const int D,
-    const float16* x,
-    const int* idx,
-    float16* y,
-    CUDAContext* context) {
-  SelectKernel<float16><<<
-      CAFFE_GET_BLOCKS(N),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context->cuda_stream()>>>(N, D, x, idx, y);
-}
-
 namespace {
 template <typename T>
-__global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) {
+__global__ void ScaleKernel(
+    const int n, const T alpha, const T* x, T* y) {
   CUDA_1D_KERNEL_LOOP(i, n) {
-    // y[i] = convert::To<float,T>(convert::To<T, float>(x[i]) * alpha);
-    y[i] = convert::Get<T>(convert::Get<float>(x[i]) * alpha);
+    y[i] = x[i] * alpha;
   }
 }
 
 template <typename T>
-__global__ void
-ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) {
+__global__ void ScaleKernelDeviceAlpha(
+    const int n, const T* alpha, const T* x, T* y) {
   CUDA_1D_KERNEL_LOOP(i, n) {
     y[i] = x[i] * (*alpha);
   }
@@ -630,20 +461,6 @@ __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) {
     y[i] = powf(x[i], exponent);
   }
 }
-
-// fp16 specialization
-template <>
-__global__ void ScaleKernelDeviceAlpha(
-    const int n,
-    const float* alpha,
-    const float16* x,
-    float16* y) {
-  CUDA_1D_KERNEL_LOOP(i, n) {
-    y[i] = convert::To<float, float16>(
-        convert::To<float16, float>(x[i]) * (*alpha));
-  }
-}
-
 }  // namespace
 
 template <>
@@ -672,17 +489,12 @@ void Scale<float, CUDAContext>(
 }
 
 template <>
-void Scale<float16, CUDAContext>(
-    const int n,
-    const float alpha,
-    const float16* x,
-    float16* y,
+void Scale<double, CUDAContext>(
+    const int n, const double alpha, const double *x, double* y,
     CUDAContext* context) {
-  ScaleKernel<float16><<<
-      CAFFE_GET_BLOCKS(n),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context->cuda_stream()>>>(n, alpha, x, y);
+  ScaleKernel<double><<<
+      CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
+          n, alpha, x, y);
 }
 
 template <>
@@ -695,17 +507,11 @@ void Scale<float, CUDAContext>(
 }
 
 template <>
-void Scale<float16, CUDAContext>(
-    const int n,
-    const float* alpha,
-    const float16* x,
-    float16* y,
+void Scale<double, CUDAContext>(
+    const int n, const double* alpha, const double *x, double* y,
     CUDAContext* context) {
-  ScaleKernelDeviceAlpha<float16><<<
-      CAFFE_GET_BLOCKS(n),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context->cuda_stream()>>>(n, alpha, x, y);
+  ScaleKernelDeviceAlpha<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS,
+                       0, context->cuda_stream()>>>(n, alpha, x, y);
 }
 
 template <>
@@ -721,42 +527,18 @@ void Axpy<float, CUDAContext>(
 template <>
 void Axpy<double, CUDAContext>(
     const int N,
-    const float alpha,
+    const double alpha,
     const double* X,
     double* Y,
     CUDAContext* context) {
-  double alpha_d{alpha};
-  CUBLAS_ENFORCE(
-      cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1));
-}
-
-template <>
-void Axpy<float16, CUDAContext>(
-    const int N,
-    const float alpha,
-    const float16* X,
-    float16* Y,
-    CUDAContext* context) {
-  CUBLAS_CHECK(cublasAxpyEx(
-      context->cublas_handle(),
-      N,
-      &alpha,
-      CUDA_R_16F,
-      X,
-      CUDA_R_16F,
-      1,
-      Y,
-      CUDA_R_16F,
-      1,
-      CUDA_R_32F));
+  CUBLAS_ENFORCE(cublasDaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1));
 }
 
 namespace {
 template <typename T>
-__global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) {
+__global__ void AxpyKernel(const int n, const T* a, const T* x, T* y) {
   CUDA_1D_KERNEL_LOOP(index, n) {
-    y[index] = convert::Get<T>(
-        convert::Get<float>(x[index]) * (*a) + convert::Get<float>(y[index]));
+    y[index] += x[index] * (*a);
   }
 }
 }  // namespace
@@ -770,19 +552,14 @@ void Axpy<float, CUDAContext>(
 }
 
 template <>
-void Axpy<float16, CUDAContext>(
-    const int n,
-    const float* alpha,
-    const float16* X,
-    float16* Y,
-    CUDAContext* context) {
-  AxpyKernel<float16><<<
-      CAFFE_GET_BLOCKS(n),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context->cuda_stream()>>>(n, alpha, X, Y);
+void Axpy<double, CUDAContext>(
+    const int n, const double* alpha, const double* X,
+    double* Y, CUDAContext* context) {
+  AxpyKernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS,
+                       0, context->cuda_stream()>>>(n, alpha, X, Y);
 }
 
+
 namespace {
 template <typename T>
 __global__ void AxpbyKernel(const int n, const T a, const T* x,
@@ -801,6 +578,14 @@ void Axpby<float, CUDAContext>(
                        0, context->cuda_stream()>>>(n, a, x, b, y);
 }
 
+template <>
+void Axpby<double, CUDAContext>(
+    const int n, const double a, const double* x, const double b, double* y,
+    CUDAContext* context) {
+  AxpbyKernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS,
+                        0, context->cuda_stream()>>>(n, a, x, b, y);
+}
+
 namespace {
 
 template <typename T>
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index b1f930bec0..2ceeddd355 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -67,4 +67,61 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
   }
 }
 
+#define TEST_GEMV_WITH_TYPE(field_name)                                      \
+  TEST(MathUtilGPUTest, testGemv_##field_name) {                             \
+    if (!HasCudaGPU())                                                       \
+      return;                                                                \
+    Workspace ws;                                                            \
+    DeviceOption option;                                                     \
+    option.set_device_type(CUDA);                                            \
+    CUDAContext context(option);                                             \
+    Blob* blobx = ws.CreateBlob("X");                                        \
+    Blob* bloby = ws.CreateBlob("Y");                                        \
+    Blob* blobz = ws.CreateBlob("Z");                                        \
+    Blob* bloby_host = ws.CreateBlob("Y_host");                              \
+                                                                             \
+    vector<int> shapex{64, 128};                                             \
+    vector<int> shapey{64};                                                  \
+    vector<int> shapez{128};                                                 \
+                                                                             \
+    auto* tensorx = blobx->GetMutable<Tensor<CUDAContext>>();                \
+    tensorx->Resize(shapex);                                                 \
+    math::Set<field_name, CUDAContext>(                                      \
+        64 * 128,                                                            \
+        (field_name)1.0,                                                     \
+        tensorx->mutable_data<field_name>(),                                 \
+        &context);                                                           \
+                                                                             \
+    auto* tensory = bloby->GetMutable<Tensor<CUDAContext>>();                \
+    tensory->Resize(shapey);                                                 \
+    math::Set<field_name, CUDAContext>(                                      \
+        64, (field_name)1.0, tensory->mutable_data<field_name>(), &context); \
+                                                                             \
+    auto* tensorz = blobz->GetMutable<Tensor<CUDAContext>>();                \
+    tensorz->Resize(shapez);                                                 \
+                                                                             \
+    math::Gemv<field_name, CUDAContext>(                                     \
+        CblasTrans,                                                          \
+        64,                                                                  \
+        128,                                                                 \
+        1.0,                                                                 \
+        tensorx->template data<field_name>(),                                \
+        tensory->mutable_data<field_name>(),                                 \
+        0.0,                                                                 \
+        tensorz->template mutable_data<field_name>(),                        \
+        &context);                                                           \
+    context.FinishDeviceComputation();                                       \
+                                                                             \
+    auto* tensory_host = bloby_host->GetMutable<Tensor<CPUContext>>();       \
+    tensory_host->CopyFrom<CUDAContext, CUDAContext>(*tensorz, &context);    \
+    context.FinishDeviceComputation();                                       \
+                                                                             \
+    for (int i = 0; i < 128; i++) {                                          \
+      EXPECT_EQ(tensory_host->data<field_name>()[i], 64.0);                  \
+    }                                                                        \
+  }
+
+TEST_GEMV_WITH_TYPE(float);
+TEST_GEMV_WITH_TYPE(double);
+
 } // namespace caffe2
-- 
cgit v1.2.3